Packages
library(tidyverse) #Includes Dplyr & GGplot
library(Cairo) # Anti aliased graphics
library(patchwork) # simplified arrangement of graphs
library(gmodels) # CrossTable()
library(corrr) # correlation functions
library(ggm) # some functions for partial correlation
library(car) # vif function
library(knitr) # for knitting rmarkdown
library(data.table) # less metadata than tibbles
library(mice) # for multiple imputation
library(mitools) # for multiple imputations
library(binom) # binom.confint()
library(gridExtra) # for graphical arrangement
Knitr
Settings
# Document-wide default chunk options.
knitr::opts_chunk$set(
  echo = TRUE,                                 # show code in the output
  dev.args = list(png = list(type = "cairo")), # Cairo anti-aliased PNGs
  dpi = 300                                    # rendering resolution
)
Load
Datasets
# Load the three Kaggle Titanic CSVs.
# NOTE(review): paths are relative to the working directory — assumes a
# 'Kaggle/' folder next to this document; confirm before knitting.
titanic.lb <- read.csv('Kaggle/titanicleaderboard.csv',
header = TRUE) # leaderboard database
titanic <- as_tibble(read.csv('Kaggle/train_titanic.csv',
header = TRUE)) # main database, converted to tibble
titanic.test <- read.csv('Kaggle/test_titanic.csv',
header = TRUE) # test database
# Print a tibble with a temporarily widened numeric display (pillar.sigfig).
# Useful when the printed tibble shows fewer decimals than required.
# Returns the input invisibly so it can be used inside a pipe.
more_tib <- function(tbl, sigfig = 3) {
  global_sigfig <- getOption("pillar.sigfig") # store global setting
  # Fix: register the restore handler BEFORE printing. The original called
  # on.exit() after print(tbl), so an error during printing left the
  # pillar.sigfig option permanently changed.
  on.exit(options(pillar.sigfig = global_sigfig), add = TRUE)
  options(pillar.sigfig = sigfig) # change to desired value
  print(tbl)
  invisible(tbl)
}
# Prints three pseudo R^2 statistics for a fitted logistic regression
# (Hosmer & Lemeshow, Cox & Snell, Nagelkerke). Andy Field's method.
log_pseudoR2s <- function(LogModel) {
  model.dev <- LogModel$deviance
  null.dev  <- LogModel$null.deviance
  n         <- length(LogModel$fitted.values)
  # Hosmer & Lemeshow: proportional reduction in deviance
  hosmer <- 1 - model.dev / null.dev
  # Cox & Snell: based on the likelihood-ratio improvement per observation
  cox.snell <- 1 - exp(-(null.dev - model.dev) / n)
  # Nagelkerke: Cox & Snell rescaled so the maximum attainable value is 1
  nagelkerke <- cox.snell / (1 - exp(-(null.dev / n)))
  cat("Pseudo R^2 for logistic regression\n")
  cat("Hosmer and Lemeshow R^2 ", round(hosmer, 3), "\n")
  cat("Cox and Snell R^2 ", round(cox.snell, 3), "\n")
  cat("Nagelkerke R^2 ", round(nagelkerke, 3), "\n")
}
# Takes a regression model, the dataset, and columns of interest.
# Returns a new dataset which retains cols of interest + diagnostic data
# (fitted probabilities, residuals, dfbeta, dffits, Cook's distance,
# leverage), assigns it to "<modelname>_tib" in the GLOBAL environment,
# and opens it in the data viewer.
# NOTE(review): two side effects — global assign() and view(). Assumes the
# model was fitted on `data` with no rows dropped (e.g. by NAs), otherwise
# the diagnostic columns would misalign with the selected rows — confirm.
model_tib <- function(model, data, ...) {
cols <- quos(...) # stores cols for use within select() [c() didn't work]
tibble_name <- # Create new tibble name in format "modelname_tib"
paste0(deparse(substitute(model)), "_tib")
data %>%
select(!!!cols) %>% # splices stored data back into usable arguments
mutate(predicted.prob = fitted(model), # diagnostic data
standardized.res = rstandard(model),
studentized.res = rstudent(model),
dfbeta = dfbeta(model), # NOTE(review): dfbeta() returns a matrix column
dffit = dffits(model),
cookd = cooks.distance(model),
leverage = hatvalues(model)) %>%
assign(tibble_name, ., envir = .GlobalEnv) %>% # assign under generated name
view()
}
### Computes the Wald confidence interval for a proportion.
### successes, trials: define the sample proportion successes/trials.
### z: critical value; defaults to 1.96 (95% CI) so existing calls are
###    unchanged, but can now be overridden for other confidence levels.
### Returns a named vector c(Mean, Lower, Upper).
wald_95ci <- function(successes, trials, z = 1.96) {
  prop <- successes / trials
  # Standard error of the proportion
  se <- sqrt((prop * (1 - prop)) / trials)
  # CI bounds
  Lower.CI <- prop - z * se
  Upper.CI <- prop + z * se
  c(Mean = prop, Lower = Lower.CI, Upper = Upper.CI)
}
# Computes the Wilson confidence interval for a proportion.
# successes, trials: define the sample proportion successes/trials.
# z: critical value; defaults to 1.96 (95% CI), matching the original.
# Fix: the original ended with print(results); it now RETURNS the vector,
# consistent with wald_95ci. Called at top level it still auto-prints,
# but assigning the result no longer produces console output.
# Returns a named vector c(Mean, Lower, Upper).
wilson_95ci <- function(successes, trials, z = 1.96) {
  prop <- successes / trials
  # Standard error, Wilson form (includes the z^2 / 4n^2 term)
  se <- sqrt((prop * (1 - prop) / trials) + (z^2) / (4 * (trials^2)))
  # Adjust proportion for small samples
  prop.adj <- prop + (z^2) / (2 * trials)
  # CI bounds, shrunk toward 0.5 by the denominator
  Lower.CI <- (prop.adj - z * se) / (1 + (z^2) / trials)
  Upper.CI <- (prop.adj + z * se) / (1 + (z^2) / trials)
  c(Mean = prop, Lower = Lower.CI, Upper = Upper.CI)
}
Before engaging with the main data set, I wish to preview the
leaderboard to understand what the distribution of scores looks like,
and what to aim for.
The gender_submission.csv file provided by Kaggle as an example,
which predicts survivability based exclusively on gender, has a score of
0.76555. Any model I design must improve on this baseline score.
The variables of interest in the titanic leaderboard dataset are ‘Score’
and ‘SubmissionCount’. I want to look at the overall score distribution,
as well as the typical expected score for any given amount of
submissions.
# Summary of the two leaderboard variables of interest
summary(titanic.lb[, c("Score", "SubmissionCount")])
# Q-Q Plot of Score.
# Fix: replaced right-assignment (->) at the end of the pipelines with
# conventional <- (tidyverse style guide discourages ->); output unchanged.
qq1 <- titanic.lb %>%
  ggplot(aes(sample = Score)) +
  stat_qq() +
  stat_qq_line() +
  scale_y_continuous(breaks = c(0, .25, .50, .75, .80, 1)) +
  labs(x = "Theoretical Quantiles",
       y = "Score Quantiles") +
  theme_bw()
# Histogram of Score, with bin counts printed above the bars
hist1 <- titanic.lb %>%
  ggplot(aes(Score)) +
  geom_histogram() +
  stat_bin(geom = 'text',
           vjust = -0.5,
           size = 3,
           aes(y = after_stat(count),
               label = after_stat(count))) +
  labs(x = "Score",
       y = "Count") +
  theme_bw()
qq1 + hist1 # requires patchwork, otherwise returns an error
## Score SubmissionCount
## Min. :0.0000 Min. : 1.000
## 1st Qu.:0.7631 1st Qu.: 1.000
## Median :0.7751 Median : 1.000
## Mean :0.7510 Mean : 3.743
## 3rd Qu.:0.7775 3rd Qu.: 4.000
## Max. :1.0000 Max. :228.000
The Q-Q Plot and Histogram for variable Score show that the distribution
of scores for the Titanic competition is nowhere near normal.
Most scores fall roughly between 0.75 and 0.80. There are two interesting
oddities in the bin counts at 0.50 and 1.00, but it’s beyond the scope
here to investigate what might cause them. The value at 0.50 is
dismissed due to being lower than our baseline of 0.76555. As for the
perfect 1.00, given the results are made public, it’s not a stretch to
suggest some users have merely copied the file and reposted it as their
own.
Next I’ll sort the data to identify the most common scores. My suspicion
is that the peak of 10157 observations is being caused by an
over-representation of the 0.76555 score, since that’s the
gender_submission file provided by Kaggle. I want these tutorial
submissions to be excluded.
sort(table(titanic.lb$Score), decreasing = TRUE)[1:5] # Top 5 most common scores
##
## 0.77511 0.76555 0.53349 0.77751 0.78229
## 5254 1336 923 889 611
Although I was correct in assuming the 0.76555 score is over represented
in the data, the peak is actually being caused by a different value, the
score 0.77511. At a count of 5254, that’s nearly 1/3 of the entire
dataset!
This count is too large to be ignored. Reading further through the
Titanic Kaggle webpage, I found it also includes a tutorial on how to
create a random forest model. I ran and submitted the code and sure
enough it returns a 0.77511 score.
I’ll therefore exclude all 0.77511 and 0.76555 scores. Although it’s
possible that some gender model submissions are genuine attempts, and
also possible that these two specific scores can be obtained through
different models, I have no way of isolating such attempts from the
tutorial submissions and feel confident that entirely excluding them
will still result in a better representation of genuine submissions than
leaving them in.
# Drop the two tutorial scores (0.76555, 0.77511) and zoom into 0.70-0.82
lb.subset <- titanic.lb %>%
  filter(!(Score %in% c(0.76555, 0.77511)),
         Score >= 0.70,
         Score <= 0.82)
# Histogram of Scores between 0.700 and 0.825
lb.subset %>%
  ggplot(aes(Score)) +
  geom_histogram(bins = 20) +
  geom_text(stat = "bin", # bin counts printed above each bar
            bins = 20,
            aes(label = after_stat(count)),
            vjust = -0.5,
            size = 3) +
  scale_x_continuous(breaks = seq(0.700, 0.825, by = 0.0125)) +
  geom_vline(xintercept = 0.76555, # gender_submission.csv baseline score
             linetype = "dashed", color = "firebrick1", linewidth = 1) +
  annotate("text",
           x = 0.76555,
           y = Inf, label = "Gender model\nbaseline score",
           hjust = 1.1, vjust = 1.1, color = "firebrick1") +
  labs(x = "Score", y = "Count") +
  theme_bw()
Given this distribution, my goal is to have a score higher than 0.7875
by the time all variables have been considered.
I would also like to know how the submission count relates to this
subset of scores:
# Counts per submission
table(lb.subset$SubmissionCount)
# Scatter plot of Score against SubmissionCount
ggplot(lb.subset, aes(Score, SubmissionCount)) +
  geom_point()
# Average score at each submission count
avg.score.per.count <- lb.subset %>%
  group_by(SubmissionCount) %>%
  summarise(Avg_Score = mean(Score))
avg.score.per.count %>%
  ggplot(aes(SubmissionCount, Avg_Score)) +
  geom_smooth(method = 'lm',
              formula = y ~ log(x), # natural logarithm of x
              se = FALSE,           # fix: spelled out (F is reassignable)
              colour = 'firebrick2') +
  geom_point(colour = 'grey30') +
  labs(x = "Submission Count", y = "Average Score") +
  # Red marker: the model's predicted score at 20 submissions (pred.scores)
  geom_point(aes(x = 20, y = 0.7874629), color = "red", size = 2) +
  theme_bw()
# Log-linear model of average score on submission count
lm_titanic.lb.score <- lm(Avg_Score ~ log(SubmissionCount), data = avg.score.per.count)
summary(lm_titanic.lb.score)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2987 1300 664 516 421 301 230 223 191 339 134 110 97 65 79 57
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 41 54 47 86 26 25 16 17 18 20 18 12 17 18 15 7
## 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 7 7 5 11 5 1 11 10 3 9 8 1 4 3 2 6
## 49 50 51 52 53 54 55 56 57 58 61 62 63 64 66 67
## 5 2 1 4 5 2 1 1 2 1 2 1 1 2 3 2
## 68 70 71 74 77 79 80 81 83 85 93 99 103 112 113 122
## 1 1 1 3 1 1 1 2 1 1 1 1 1 1 1 1
## 228
## 1
##
## Call:
## lm(formula = Avg_Score ~ log(SubmissionCount), data = avg.score.per.count)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.011698 -0.002601 0.000270 0.001504 0.017843
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7641875 0.0021536 354.84 <2e-16 ***
## log(SubmissionCount) 0.0077695 0.0005954 13.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.005205 on 79 degrees of freedom
## Multiple R-squared: 0.6831, Adjusted R-squared: 0.6791
## F-statistic: 170.3 on 1 and 79 DF, p-value: < 2.2e-16
There’s a user with 228 submissions, which is close to double the amount
of the user with the second highest amount of submissions, at 122.
The logarithmic model seems to fit the data fairly well, particularly
before the 50th submission count, at which point the residuals become
larger. This isn’t surprising since larger counts have much fewer cases,
making the averages within each SubmissionCount past this point
unreliable.
The model predicts the following scores at 1, 5, 10, 15 and 20
submissions:
# Predicted scores at 1, 5, 10, 15 and 20 submissions
new.counts <- data.frame(SubmissionCount = c(1, 5, 10, 15, 20))
pred.scores <- setNames(predict(lm_titanic.lb.score, newdata = new.counts),
                        c("1 Sub", "5 Subs", "10 Subs", "15 Subs", "20 Subs"))
pred.scores
## 1 Sub 5 Subs 10 Subs 15 Subs 20 Subs
## 0.7641875 0.7766921 0.7820775 0.7852277 0.7874629
The sample of 891 I’m working with corresponds to roughly 68% of the
entire passenger population (estimated at a total of 1316 passengers).
There were also roughly 900 crew members on board, which do not seem to
be represented in the sample.
# Survived: 1 = survived, 0 = died
surv.counts <- table(titanic$Survived)
cat("Survival Counts, Survived = 1")
surv.counts
cat("\nSurvival Rates, Survived = 1")
prop.table(surv.counts)
## Survival Counts, Survived = 1
## 0 1
## 549 342
##
## Survival Rates, Survived = 1
## 0 1
## 0.6161616 0.3838384
Given that the overall survival rate in the sample is 38.4%, any subset
of variable categories, values or ranges which deviate considerably from
this rate, and which are sufficiently represented in the sample, are
likely to be good predictors.
The following are the potential predictor variables along with
preliminary hypothesis - mostly based on personal preconceptions - to
give initial direction to the analysis of the data.
“Survivability” and “P(Survived)” are used interchangeably throughout
the document.
# 2 empty strings recoded as 'S'. Reason explained in Embarked section.
titanic$Embarked[titanic$Embarked == ''] <- 'S'
# Recoding 'Sex' and 'Embarked' variables as factors.
# Fix: mutate_at() is superseded; across() is the current dplyr idiom.
titanic <- titanic %>%
  mutate(across(all_of(c('Sex', 'Embarked')), as.factor))
cat('Dataset base variables\n')
names(titanic)
cat('\nVariable Summary\n')
summary(titanic)
## Dataset base variables
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
##
## Variable Summary
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## female:314 Min. : 0.42 Min. :0.000 Min. :0.0000
## male :577 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 C:168
## Class :character 1st Qu.: 7.91 Class :character Q: 77
## Mode :character Median : 14.45 Mode :character S:646
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
The description of the variables can be found at: https://www.kaggle.com/competitions/titanic/data
Pclass: An ordinal variable with three categories which function
as a proxy for social class (Upper, Middle, Lower).
Hypothesis: The higher the social class, the higher the
survivability.
Name: Passenger name, including the title and surname. It might
perhaps be possible to identify nationalities based on surname and
create a subclass of aristocrats (based on title).
Sex: The baseline submission uses Sex as a predictor.
Age: A numerical variable. The ages for individuals under the age
of 1 are fractional. I am disinclined to believe these fractional values
are relevant.
Hypothesis: Younger people have a higher survivability
rate.
My intuition tells me the relationship will not be one of simple
linearity. I’m more inclined to expect very high survivability rates at
young ages with values significantly dropping off and stabilizing around
the ages 14-18.
SibSp and Parch: These variables provide the amount of
siblings/spouses and the amount of parents/children aboard the titanic,
respectively, for each observation. SibSp + Parch should return the
amount of family members (excluding self) for any given passenger.
Hypothesis: Larger families have lower
survivabilities.
After controlling for the interaction with Pclass (I expect 3rd class
passengers to have more children, and therefore lower survivabilities
because of that), I hypothesise that we’ll still observe lower
survivabilities for families with large numbers, because:
a) parents may not be keen to leave children behind;
b) parents (especially women) with very young children may be given
preference when boarding lifeboats.
These two effects should give an advantage when adults have one or two
children, but less so when they have no children, or lots of
children.
Ticket: A string with the ticket number. A quick inspection
through the data shows that some patterns can be identified: e.g.,
tickets with the letters ‘PC’ or ‘STON’. At this point I don’t know what
to make of it and if it’s relevant information that is not already
contained in pclass.
Fare: I’d expect the fare to be closely related to the ticket
class. Nevertheless, it will be worthwhile to check for outliers that do
not fit this pattern, and also if we can identify sub classes within the
ticket classes.
Cabin: A string with the Cabin assigned to that passenger. Likely
correlated with Pclass. I might use external data in the form of a map
of the titanic to check the cabin locations.
Embarked: Categorical variable corresponding to the port of
embarkation (‘C’ = Cherbourg, France; ‘Q’ = Queenstown, Ireland; ‘S’ =
Southampton, England).
Below is a correlation matrix:
# converting Sex to dummy variable, with 'male' as baseline.
# Required to compute correlation.
titanic$Sex_F<-ifelse(titanic$Sex == "female",1,0)
# Dummy variables for embarked
titanic <- titanic %>%
mutate(Embark_Q = ifelse(Embarked == 'Q',1,0),
Embark_C = ifelse(Embarked == 'C',1,0),
Embark_S = ifelse(Embarked == 'S',1,0))
correlation_matrix <- correlate(titanic, use = "pairwise.complete.obs")
# Conducting partial correlation for Fare x Survived
temp.titanic <- titanic %>%
select(Survived, Fare, Pclass) # controlling for Pclass
cat('\nPartial Correlation of Fare x Survived (control for Pclass) \n')
pcor(c('Survived', 'Fare', 'Pclass'), var(temp.titanic))
##
## Partial Correlation of Fare x Survived (control for Pclass)
## [1] 0.0907064
As addressed in the introductory section, the tutorial for the titanic
kaggle competition already provides an example file which predicts
survivability based on gender (if female, Survival = 1, else Survival =
0). This gender model has a score of 0.76555, which serves as the
baseline we’re looking to improve upon.
# Gender counts and gender-by-survival cross-tabulations.
# Fix: replaced right-assignment (->) with conventional <- throughout
# (style guide: avoid ->); printed output is unchanged.
cat('\nTotal Count')
tbl1 <- table(titanic$Sex)
tbl1
cat('\nProportion relative to the total')
prop.tbl1 <- prop.table(table(titanic$Sex))
prop.tbl1
cat('\nCount per category of Survived, Survived = 1')
tbl2 <- table(titanic$Survived, titanic$Sex)
addmargins(tbl2)
cat('\nRelative Gender frequencies within each category of Survived')
prop.tbl2 <- prop.table(tbl2, margin = 1) # rows sum to 1
prop.tbl2
cat('\nRelative Survival frequencies within each category of Sex')
prop.tbl3 <- prop.table(tbl2, margin = 2) # columns sum to 1
prop.tbl3
##
## Total Count
## female male
## 314 577
##
## Proportion relative to the total
## female male
## 0.352413 0.647587
##
## Count per category of Survived, Survived = 1
## female male Sum
## 0 81 468 549
## 1 233 109 342
## Sum 314 577 891
##
## Relative Gender frequencies within each category of Survived
## female male
## 0 0.1475410 0.8524590
## 1 0.6812865 0.3187135
##
## Relative Survival frequencies within each category of Sex
## female male
## 0 0.2579618 0.8110919
## 1 0.7420382 0.1889081
Of the 891 people in the sample, there are 314 women (35%) against 577
men (65%).
Despite only about 35% of passengers in the sample being women, they account for 68% of the 342 survivals. On the other hand, of the 549 casualties in the sample, 85% are men.
All other variables ignored, if we were to select a woman at random from the sample, there’s a 74% chance she survived. That value is only about 19% if we were to select a man at random.
We can extrapolate the survivability among women to the population by
calculating a confidence interval, given by: \(\begin{aligned} \hat{p} \pm Z
\sqrt{\frac{\hat{p}(1 - \hat{p})}{n}} \end{aligned}\)
# Standard error of a sample proportion, without finite-population
# correction: sqrt(p * (1 - p) / n)
SE_p <- function(proportion, sample_size) {
  variance <- proportion * (1 - proportion) / sample_size
  sqrt(variance)
}
# SE and Wald 95% CI for the proportion of women who survived (n = 314)
SE_p(prop.tbl3[2,1], 314) # SE for the proportion of survivals among women
prop.tbl3[2,1] + 1.96 * SE_p(prop.tbl3[2,1], 314) # upper limit
prop.tbl3[2,1] - 1.96 * SE_p(prop.tbl3[2,1], 314) # lower limit
## [1] 0.02469028
## [1] 0.7904312
## [1] 0.6936453
The standard error for the proportion of women who survived at the
titanic, given our sample and without correction, is 0.0247, or about
2.47%.
For a 95% confidence interval, we calculate the range around the sample
proportion equivalent to roughly 2 standard errors (1.96, to be
precise).
Given our sample, the estimated survival rate among women at the
titanic, for a 95% CI, is between 69.4% to 79%.
But if our sample already contains 68% of cases we cannot treat it the
same way as we would treat a sample of 2%, for instance, or a sample
from a potentially infinite population. The variability left to explain
is reduced in our scenario. So we multiply the standard error by the
finite population factor to correct for the proportion of our sample in
the total population.
\(\begin{aligned}FPC =
\sqrt{\frac{(N-n)}{N-1}}\end{aligned}\)
# Finite population correction factor: sqrt((N - n) / (N - 1))
fpc <- function(pop_size, sample_size) {
  ratio <- (pop_size - sample_size) / (pop_size - 1)
  sqrt(ratio)
}
# FPC for a sample of 891 out of ~1316 passengers, rounded to 4 dp
titanic.fpc <- round(fpc(1316,891),4)
se_w <- SE_p(prop.tbl3[2,1], 314) * titanic.fpc # corrected SE
se_w
prop.tbl3[2,1] + 1.96 * se_w # upper limit
prop.tbl3[2,1] - 1.96 * se_w # lower limit
## [1] 0.01403642
## [1] 0.7695496
## [1] 0.7145268
The Confidence Interval means that, if we were to obtain new samples of
the same size from the same population, with replacement, as the number
of samples tend to infinity, 95% of those samples would produce
confidence intervals that contain the true population proportion.
A 95% CI does not mean that there’s a 95% chance the true population
proportion is between 71.5% and 77%. It’s just an interval of plausible
values at a given confidence level with an associated error.
We could, for instance, lower the CI to 90%, which gives a narrower
range, but that increases the chance of the range excluding the
population proportion.
Finally, in the case of a proportion, apart from the size of the sample,
the further away our proportion is from 0.5, the more precise the
estimate will be (the highest possible numerator is 0.5 * 0.5 =
0.25).
This means that the interval for survivability in the case of men will
be narrower, since we’re increasing the subset size from 314 to 577, and
moving further away from 0.5 with a 0.19 probability.
# Corrected SE and 95% CI for survival among men (n = 577)
se_m <- SE_p(prop.tbl3[2,2], 577) * titanic.fpc # corrected SE
se_m
prop.tbl3[2,2] + 1.96 * se_m # upper limit
prop.tbl3[2,2] - 1.96 * se_m # lower limit
## [1] 0.009264093
## [1] 0.2070658
## [1] 0.1707505
We can also conduct a chi-square analysis to verify if a pair of
categorical variables are independent, despite it being clear enough
from our proportions and sample size that there’s a statistically
significant relationship between gender and survivability. In any
case:
# Observed contingency table for Survived x Sex (tbl2 rebuilt here so the
# chunk is self-contained). Fix: right-assignment (->) replaced with <-.
# The cat() string is kept byte-identical to match the echoed output.
cat("Contigency Table of Observed values for Survived x Sex\n")
tbl2 <- table(titanic$Survived, titanic$Sex)
addmargins(tbl2)
## Contigency Table of Observed values for Survived x Sex
##
## female male Sum
## 0 81 468 549
## 1 233 109 342
## Sum 314 577 891
Pearson’s Chi-Square test calculates how well the expected values in a
contingency table fit the actual observed values.
The expected values are the frequencies in each cell that would be
expected to be observed if they were true to their category proportions.
When this occurs, the discrepancies between the cells merely reflect the
differences in these proportions.
Therefore, the null hypothesis of the test is that there is no
statistically significant difference between the observed and the
expected values.
\(H_0:\) Variables Sex and Survived are
independent.
The expected values may be found as such: we obtain the proportion of a
given variable category, then multiply by the categories of the second
variable. In this case:
\(\begin{aligned}\frac{Sex_j}{Sample.Size} *
Survived_{i} = Cell.Freq_{ij}\end{aligned}\)
So:
Proportion of women = 314 / 891 = 0.352413
Proportion of men = 1 - 0.352413 = 0.647587
Given these proportions, we can obtain the expected values for each
category of the variable Survived:
0.352413 * 549 = 193.5 # Expected value of amount of women that
died
0.352413 * 342 = 120.5 # Expected value of amount of women that
survived
549 - 193.4747 = 355.5 # Expected value of amount of men that died
342 - 120.5252 = 221.5 # Expected value of amount of men that
survived
cat("Contigency Table of Expected values for Survived x Sex\n\n")
# Expected cell frequencies (Survived x Sex), computed manually above
list1 <- c(193.5, 120.5, 355.5, 221.5)
Sex.Survived.mtx <- matrix(list1, nrow = 2, ncol = 2,
                           dimnames = list(c('0', '1'), c('Female', 'Male')))
addmargins(Sex.Survived.mtx)
## Contigency Table of Expected values for Survived x Sex
##
## Female Male Sum
## 0 193.5 355.5 549
## 1 120.5 221.5 342
## Sum 314.0 577.0 891
These are the frequencies we would expect to find in each cell if the
categories of the variable Sex were independent of the categories of the
variable Survived.
The resulting table distributes the frequencies by each cell in a way
that retains the proportions in each variable.
Finally, we verify if the differences between the Observed and Expected
values are actually significant.
We can do so manually:
\(\begin{aligned}\chi^2 = \sum \frac{(O_{ij} -
E_{ij})^2}{E_{ij}}\end{aligned}\)
\(\begin{aligned}\chi^2 = \frac{{\left(
81-193.5 \right)^2}}{193.5}+\frac{{\left( 233-120.5
\right)^2}}{120.5}+\frac{{\left( 468-355.5
\right)^2}}{355.5}+\frac{{\left( 109-221.5 \right)^2}}{221.5} = 65.4 +
105 + 35.6 + 57.1 = \color{#FF3D3D} \Large
263.1\end{aligned}\)
\(\begin{aligned}df=(r−1)×(c−1),
df=1\end{aligned}\)
Or using R:
chisq.test(tbl2, correct = F) #2x2 Contigency Table, without Yate's continuity correction
##
## Pearson's Chi-squared test
##
## data: tbl2
## X-squared = 263.05, df = 1, p-value < 2.2e-16
The chi-square statistic we obtain is compared against a standardized
chi-square distribution with df=(r−1)×(c−1) degrees of freedom. It gives
us the probability of finding a chi-square statistic at least that high
if the null hypothesis of there being no statistical difference between
observed and expected values were correct.
The returned p-value of “< 2.2e-16” is what R defaults to when the p-value is exceedingly low.
For context, a p-value of 0.01 corresponds to a chi-square statistic of 6.634. A p-value as low as 0.0001 corresponds to a chi-square statistic of 15.13.
# Critical chi-square values (df = 1) at p = 0.01 and p = 0.0001
qchisq(p = 0.01, df = 1, lower.tail = FALSE)
qchisq(p = 0.0001, df = 1, lower.tail = FALSE)
## [1] 6.634897
## [1] 15.13671
In particular when dealing with larger contingency tables, it is often
informative to look at the residuals for each given cell to identify
which category pairings are actually driving the significance in the
chi-square result.
CrossTable(tbl2, fisher = F, chisq = T, expected = T, resid = T, sresid = T, asresid = T, format = "SPSS", digits = 1)
##
## Cell Contents
## |-------------------------|
## | Count |
## | Expected Values |
## | Chi-square contribution |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## | Residual |
## | Std Residual |
## | Adj Std Resid |
## |-------------------------|
##
## Total Observations in Table: 891
##
## |
## | female | male | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 81 | 468 | 549 |
## | 193.5 | 355.5 | |
## | 65.4 | 35.6 | |
## | 14.8% | 85.2% | 61.6% |
## | 25.8% | 81.1% | |
## | 9.1% | 52.5% | |
## | -112.5 | 112.5 | |
## | -8.1 | 6.0 | |
## | -16.2 | 16.2 | |
## -------------|-----------|-----------|-----------|
## 1 | 233 | 109 | 342 |
## | 120.5 | 221.5 | |
## | 105.0 | 57.1 | |
## | 68.1% | 31.9% | 38.4% |
## | 74.2% | 18.9% | |
## | 26.2% | 12.2% | |
## | 112.5 | -112.5 | |
## | 10.2 | -7.6 | |
## | 16.2 | -16.2 | |
## -------------|-----------|-----------|-----------|
## Column Total | 314 | 577 | 891 |
## | 35.2% | 64.8% | |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 263.0506 d.f. = 1 p = 3.711748e-59
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 260.717 d.f. = 1 p = 1.197357e-58
##
##
## Minimum expected frequency: 120.5253
The table above summarises the data I had previously manually
calculated. It also includes the residuals, standardized, and adjusted
standardized residuals, which I’ll address now.
Residuals
The residuals are simply the difference between the observed values and
the expected values for each given cell:
Res(1,1) = 81 - 193.5 = -112.5
Res(2,1) = 233 - 120.5 = 112.5
Res(1,2) = 468 - 355.5 = 112.5
Res(2,2) = 109 - 221.5 = -112.5
Standardized Residuals
The standardized residual divides the residual in a
cell by \(\sqrt{E}\),
which is the standard deviation of the expected count in that cell,
therefore measuring how many standard deviations (and in which
direction) the observed value is from the expected count:
St.Res(1,1) = \({\frac{{\left( 81-193.5
\right)}}{\sqrt{193.5}}}=-8.08\)
In the case of cell (1,1), the observed values are 8.1 standard
deviations below the expected value.
Adjusted Standardized Residuals
The adjusted standardized residuals take into consideration the expected
proportions in the overall table, and are used as a more reliable
indication of the true distance between the observed and expected values
in a cell. The denominator becomes:
\(\sqrt{E\cdot (1-r_i)\cdot(1-c_j)}\),
where:
\(r_i\) = row proportion
\(c_j\) = column proportion
Using the example of cell (1,1): \(Adj.St.Res(1,1)=\frac{-112.5}{\sqrt{193.5\cdot\left(1-\frac{549}{891}\right)\cdot\left(1-\frac{314}{891}\right)}}=-16.2\)
We’re attempting to predict a binary outcome from several continuous and
categorical predictors. Logistic regression allows us to maintain the
assumption of linearity by converting the outcome Y from its original
unit to the log-odds of P(Y) occurring.
Whereas, in linear regression, a b coefficient represents the change in
the outcome variable Y for a one unit change in the respective
predictor, in logistic regression the coefficient is the change in the
logit (the natural logarithm of the odds of P(Y)) of the outcome
variable for a one unit change in the respective predictor.
For the moment we’re using a single predictor, so the equation is given
by:
\(P(Y)=\frac{1}{1+e^{-(b_0+b_1X_{1i})}}\)
I’ll create a new dummy variable Sex_F, generate a logistic regression
model, and finally go through each statistic in the returned
results:
# Simple logistic regression: Survived ~ Sex_F (dummy: 1 = female, 0 = male).
# family = binomial() uses the default logit link.
gender.model <- glm(Survived ~ Sex_F, data = titanic, family = binomial())
summary(gender.model)
cat('\nPseudo R^2s (Hosmer & Lemeshow method)')
# R_L^2 = (null deviance - residual deviance) / null deviance,
# using the deviances reported by summary() above.
(1186.7 - 917.8)/ 1186.7
##
## Call:
## glm(formula = Survived ~ Sex_F, family = binomial(), data = titanic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4571 0.1064 -13.70 <2e-16 ***
## Sex_F 2.5137 0.1672 15.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 917.8 on 889 degrees of freedom
## AIC: 921.8
##
## Number of Fisher Scoring iterations: 4
##
##
## Pseudo R^2s (Hosmer & Lemeshow method)[1] 0.2265948
Log-Likelihood &
Deviance
The drop in unexplained deviance from 1186.7 to 917.8 tells us that the
addition of predictor “Sex” has improved our ability to predict whether
someone survived versus the baseline. These deviances are obtained by
multiplying the log-likelihood of each model by -2.
The log-likelihood statistic serves a purpose analogous to the sum of
squares for the linear model: the statistic is a measure of the amount
of unexplained deviance that remains in the model.
Personally I do not find the log-likelihood values anywhere near as
intuitive as the sum of squares, but they are easy enough to calculate.
The log-likelihoods are obtained as follows:
\(LL = \sum_{i=1}^{n} \left[ Y_i \ln (P(Y_i))
+ (1 - Y_i) \ln (1 - P(Y_i))\right]\)
Whether \(Y_i= 0\), or \(Y_i= 1\), one portion of the formula gets
multiplied by 0 and therefore cancelled. So, in practice, we get:
\(LL (Survived=1) = \sum_{i=1}^{n}\ln
(P(Survived_i))\)
\(LL (Survived=0) = \sum_{i=1}^{n}\ln
(1-P(Survived_i))\)
We simply insert the appropriate probabilities and given counts for
variable Survived to obtain the Null deviance. To recall, these are:
# Absolute and relative frequency tables for the binary outcome Survived.
cat('\nFrequencies for variable "Survived" (Survived = 1)')
# Left assignment with `<-` (the original used right assignment `->`,
# which the tidyverse style guide discourages); result is identical.
tbl.survived1 <- table(titanic$Survived)
tbl.survived1
cat('\nRelative Frequencies for variable "Survived"')
# Proportions over the whole table: P(Survived = 0), P(Survived = 1).
prop.table(tbl.survived1)
##
## Frequencies for variable "Survived" (Survived = 1)
## 0 1
## 549 342
##
## Relative Frequencies for variable "Survived"
## 0 1
## 0.6161616 0.3838384
\(LL (Survived=1) = 342 * \ln (0.383838) =
-327.4769\)
\(LL (Survived=0) = 549 * \ln (0.616161) =
-265.8516\)
\(-327.4769 + (-265.8516) =
-593.3285\)
The deviance statistic is then obtained by multiplying the
log-likelihood by -2:
\(-2LL(Baseline) = -2*-593.3285 =
\textbf{1186.657}\)
Note: The resulting values are always negative, since the maximum
possible value of P(Y) is 1, and the natural logarithm of 1 is 0.
We next calculate the model residual deviance, using the conditional
probabilities previously calculated, P(Survived|Sex):
# tbl2 and prop.tbl3 were built earlier in the document (Survived x Sex
# contingency table and its column-wise proportions) -- not visible here;
# confirm against the earlier chunk.
cat('\nSurvived x Sex table frequencies')
addmargins(tbl2) # appends row and column totals ("Sum")
cat('\nRelative Survival frequencies within each category of Sex')
prop.tbl3
##
## Survived x Sex table frequencies
## female male Sum
## 0 81 468 549
## 1 233 109 342
## Sum 314 577 891
##
## Relative Survival frequencies within each category of Sex
## female male
## 0 0.2579618 0.8110919
## 1 0.7420382 0.1889081
The log-likelihood for the conditional probabilities are:
\(LL_{survived|female} = 233 *\ln
(0.7420)=-69.52861\)
\(LL_{died|female} = 81 *\ln
(0.2580)=-109.7385\)
\(LL_{survived|male} = 109 *\ln
(0.1889)=-181.6526\)
\(LL_{died|male} = 468 *\ln
(0.8111)=-97.98232\)
\(LL_{(Survived\sim
Sex)}=(-69.52861)+(-109.7385)+(-181.6526)+(-97.98232)=-458.902\)
\(-2LL(Gender Model) = -2 * -458.902 =
\textbf{917.8}\)
Likelihood Ratio
& Chi-Square (Again)
The likelihood ratio can be directly calculated using the formula:
\(L{\chi^2}=2\sum{observed_{ij}*ln(\frac{observed_{ij}}{expected_{ij}})}\)
For large sample sizes, the likelihood ratio and Pearson’s Chi-Square
statistics will be approximately equal. Both Pearson’s chi-square and
the likelihood ratio
return a statistic which can be compared against a chi-square
distribution.
When testing for independence of the variables in a contingency table,
we might prefer the likelihood ratio when the sample size is small, or
when the precondition of 5 observations per cell for conducting a
chi-square test isn’t verified.
In the context of logistic regression, we use the likelihood ratio to
test whether the improvement in the model is statistically significant.
It serves a purpose similar to the F-ratio for a linear regression
model. We obtain the statistic by subtracting the model deviance from
the null deviance. The value is the same as if we were to use the
formula above. So:
\(L{\chi^2}=Baseline
-Model=1186.7-917.8=\textbf{268.9}\)
\(df=k_{model}-k_{baseline} = 2-1=1\),
where k is the number of predictors plus the constant.
As expected, we obtain a statistic similar to the chi-square statistic
(263.05).
We conclude the new model is predicting the outcome significantly better
than baseline, \(\chi^2(1)=268.9, p <
2.2e-16\).
Wald’s
Z-statistic
Unlike linear regression, the estimates are not in the original units of
the outcome variable. The estimates are the natural logarithm of the
odds of P(Y) for one additional unit of a given predictor. These will
need to be transformed later back into odds and/or probabilities.
Nevertheless, the analysis is identical: the estimate has an associated
standard error and a standardized z-statistic which gives us the
relative size of the estimate in standard error units. This z-statistic
can then be compared against a standardized normal distribution of
mean=0 and sd=1:
\(z=\frac{b}{SE_b}=\frac{2.5137}{0.1672}=15.04\)
A z-statistic of 15.04 is well above the conventional cutoff points
(e.g. 1.96, 2.58), and returns a very low probability of occurrence
under the null hypothesis (p < 2e-16).
The probability of obtaining such an estimate by chance alone is
extremely low. Therefore, we conclude that being a woman is a
statistically significant predictor of survivability at the
Titanic.
Odds
Ratio
I’ll first convert the logistic regression function into R code:
# Logistic (inverse-logit) function: maps the linear predictor
# b0 + b1*x onto a probability P(Y = 1) in (0, 1).
log.regression <- function(constant, coefficient) {
  linear.predictor <- constant + coefficient
  1 / (1 + exp(-linear.predictor))
}
Next I use the logistic function to calculate the probability of Y given
Sex=0 (male) and Sex=1 (female):
# Fitted P(Survived = 1) using the gender-model coefficients
# (intercept -1.4571, Sex_F 2.5137; Sex_F: 0 = male, 1 = female).
log.regression(-1.4571,2.5137*0) # P(Survived=1, Gender=Male)
log.regression(-1.4571,2.5137*1) # P(Survived=1, Gender=Female)
## [1] 0.1889113
## [1] 0.7420403
Naturally, the probabilities returned are the same ones we obtained when
we built the tables of relative frequencies (differences are due to
rounding errors):
# Sample survival proportions within each sex, for comparison with the
# fitted probabilities returned by the model above.
109/577 # proportion of survivals among men
233/314 # proportion of survivals among women
cat('\nRelative Survival frequencies within each category of Sex')
prop.tbl3 # column-proportion table built earlier in the document
## [1] 0.1889081
## [1] 0.7420382
##
## Relative Survival frequencies within each category of Sex
## female male
## 0 0.2579618 0.8110919
## 1 0.7420382 0.1889081
\(P(Survived|Female)=\frac{P(Survived\cap
Female)}{P(Female)}=233/314=0.7420\)
\(P(Survived|Male)=109/577=0.1889\)
\(P(Died|Female)=1-0.7420=0.258\)
\(P(Died|Male)=1-0.1889=0.8111\)
So it seems rather pointless to use logistic regression to obtain
probabilities for a binary outcome predicted from a single binary
variable. But this makes it easier to grasp logistic regression at an
intuitive level, which is useful to later not get bogged down in the
numbers when I start adding more predictors.
Using a simple categorical predictor also makes the Odds Ratio easier to
grasp. The odds ratio is the ratio between the odds of Y after a unit
change in the predictor, divided by the base odds.
Calculating Odds:
\(Odds(Survived|Female) =
\frac{0.742}{0.258}=2.876\)
We can interpret these odds as meaning that, roughly, for every 3 women
that survived at the titanic, 1 died.
\(Odds(Survived|Male) =
\frac{0.189}{0.811}=0.233\)
On the other hand, a man at the titanic was over 4 times more likely to
have died than survived. Or, for every man that survived, 4 died.
Knowing these odds, we can calculate the Odds Ratio, which will give us
the incremental odds of a one unit change in the predictor. In this
example, a one unit change in the predictor means simply comparing the
odds of survival among women with the odds of survival among men
(because 0=male and 1=female).
\(Odds Ratio
=\frac{Odds(Survived|Female)}{Odds(Survived|Male)}=\frac{2.876}{0.233}=12.343\)
This tells us that the odds of a woman at the Titanic to have survived
were about 12 times larger than the odds for a man.
Alternatively, by raising the constant \(e\) to the power of either coefficient, we
convert the logit values directly to Odds:
# Coefficients are on the log-odds (logit) scale; exponentiating converts
# the intercept to odds and the Sex_F coefficient to an odds ratio.
gender.model$coefficients
cat('\nCoefficients converted to Odds\n')
exp(gender.model$coefficients)
## (Intercept) Sex_F
## -1.45712 2.51371
##
## Coefficients converted to Odds
## (Intercept) Sex_F
## 0.232906 12.350663
\(e^{Intercept} =
e^{-1.4571}=0.233\)
\(e^{SexF} = e^{2.5137}=12.35\)
The exponentiated intercept gives the Odds of Survival where Sex=0, and
the exponentiated Sex_F coefficient is the ratio between the Odds of
Survival where Sex=1 and the base odds of the intercept.
\(R_L^2\) & AIC
We already tested whether the drop in unexplained deviance for the model
compared to baseline is statistically significant using a chi-square
test. Next we can obtain two statistics which will allow us to verify
whether our model is improving as we introduce new predictors.
One such statistic is the Hosmer & Lemeshow’s statistic (\(R_L^2\)), which is a straightforward
analogue of the coefficient of determination (\(R^2\)):
\(R_L^2=\frac{NullDeviance-Residual
Deviance}{NullDeviance}=\frac{1186.7-917.8}{1186.7}=0.2266\)
The logic here is the same as with (\(R^2\)): we subtract the new model’s
unexplained deviance by the baseline total unexplained deviance, thus
obtaining the amount of deviance in the outcome explained by the
predictors, and we divide the explained deviance by the total
unexplained deviance to obtain the proportion of total deviance in Y
that is explained by the predictors in the model.
Next we calculate the Akaike Information Criterion (AIC). \(R^2\) naturally increases as new variables
are added to a model. A bloated model with a myriad of poorly chosen
predictors will have a higher R^2 than a tighter model with fewer
predictors.
The AIC is a statistic which penalizes bloated models with unnecessary
predictors. By itself the statistic is meaningless, but it is useful to
compare AICs as we add new predictors. If the AIC increases, the model
fit is worse.
For a logistic regression model, we obtain it by:
\(AIC = −2LL + 2k\), where k is the
number of predictors + intercept
\(AIC_{Gender.Model}=917.8 +
2*2=921.8\)
# Diagnostics (See Custom Functions)
# model_tib() is a custom helper defined earlier in the file; presumably it
# creates `gender.model_tib` with the selected columns plus diagnostic
# data (residuals, leverage, etc.) -- confirm against its definition.
model_tib(gender.model,titanic,PassengerId, Sex_F)
view(gender.model_tib)
# Contingency tables relating Survived, Pclass and Sex.
cat('\nFrequencies of Survived x Pclass')
class.surv.tbl <- table(titanic$Survived, titanic$Pclass)
addmargins(class.surv.tbl)
cat('\nProportion of Passengers per PClass')
round(
prop.table(
table(titanic$Pclass)), 3)
cat('\nProportion of Survivals per PClass\n')
# [2,] keeps only the Survived = 1 row of the column proportions
round(
prop.table(
table(titanic$Survived, titanic$Pclass), margin = 2)[2,],3)
cat('\nFrequencies of Sex x Pclass')
addmargins(table(titanic$Sex, titanic$Pclass))
cat('\nProportion of Survived for each category of Pclass, for women')
titanic.women <- titanic %>%
filter(Sex == 'female')
round(
prop.table(
table(titanic.women$Survived, titanic.women$Pclass),
margin = 2) # calculates column proportions, i.e., P(Survived|Pclass)
,3) # rounds to 3 decimals
cat('\nProportion of Survived for each category of Pclass, for men')
titanic.men <- titanic %>%
filter(Sex == 'male')
round(
prop.table(
table(titanic.men$Survived, titanic.men$Pclass),
margin = 2)
,3)
# Barplot of Survived=1 Frequencies overlapped with Pclass frequencies.
# The full-height bar ('Died' fill) shows the class total; the survivor
# bar is drawn on top, so the visible grey area represents deaths.
Pclass.surv.counts <- titanic %>%
  group_by(Pclass) %>%
  summarise(Count = n(),
            Survived = sum(Survived == 1))
Pclass.surv.counts %>%
  ggplot(aes(factor(Pclass))) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = 'identity')
  geom_col(aes(y = Count, fill = 'Died')) +
  geom_col(aes(y = Survived, fill = 'Survived')) +
  scale_fill_manual(values = c('Died' = 'grey30', 'Survived' = 'firebrick2')) +
  labs(x = 'Passenger Class', y = 'Frequencies', fill = 'Legend',
       title = 'Survival Frequencies per Passenger Class') +
  theme_bw()
##
## Frequencies of Survived x Pclass
## 1 2 3 Sum
## 0 80 97 372 549
## 1 136 87 119 342
## Sum 216 184 491 891
##
## Proportion of Passengers per PClass
## 1 2 3
## 0.242 0.207 0.551
##
## Proportion of Survivals per PClass
## 1 2 3
## 0.630 0.473 0.242
##
## Frequencies of Sex x Pclass
## 1 2 3 Sum
## female 94 76 144 314
## male 122 108 347 577
## Sum 216 184 491 891
##
## Proportion of Survived for each category of Pclass, for women
## 1 2 3
## 0 0.032 0.079 0.500
## 1 0.968 0.921 0.500
##
## Proportion of Survived for each category of Pclass, for men
## 1 2 3
## 0 0.631 0.843 0.865
## 1 0.369 0.157 0.135
# Spearman's rank correlation between the binary outcome and the ordinal
# ticket class (tests monotonic association rather than Pearson linearity).
cor.test(titanic$Survived,titanic$Pclass, method = c('spearman'))
##
## Spearman's rank correlation rho
##
## data: titanic$Survived and titanic$Pclass
## S = 157935034, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.3396679
Questioning the
assumption of Ticket Class as proxy for Social Class
I’m confident that the possibility of upper class passengers travelling
in 3rd Class, or of working class passengers travelling in 1st Class, is
so remote that both ticket classes are good proxies for social
class.
However, I’m not as confident that middle class passengers are as
closely associated with 2nd Class tickets, especially in regards to
middle class passengers choosing to travel in 1st or in 3rd class.
My reasoning is that, if there’s a wide gap between the distributions of
fares for each ticket class, then some passengers, especially middle
class, may choose to save money and travel 3rd class. If on the other
hand the differences in fare aren’t very large, then some middle class
passengers may choose to travel 1st class (and some working class
passengers may even travel 2nd class).
There’s a lot of theory and assumptions going on here which, if this
were a proper study, would require a thorough revision of the
literature. But that is not the purpose of this document, so I apologize
for the unsupported claims in this section.
The choice between spending now or saving for such goals as retirement,
education or property ownership, are more typically associated with the
middle class. Lower classes do not have much disposable income to save
in the first place, and the upper classes have greater incentives to
maintain their social status and networks (plus, luxuries such as a 1st
class cruiser ticket are often a small portion of their incomes). This
isn’t a static phenomenon across history, but it may be an adequate
theoretical model of early 20th century western societies.
Why this might matter? Theoretically, there’s the possibility middle
class passengers travelling in 3rd class had a higher chance of survival
than other 3rd class passengers, if outward appearance of social status
was a factor during selection for lifeboats. And in the same manner,
middle class passengers travelling 1st class would have a lower chance
of survival than actual upper class passengers.
Unfortunately, I’m not confident this is a hypothesis that I’ll be able
to test, but I’d still rather have that awareness in case evidence for
it does show up elsewhere later.
In short, middle class passengers may have a greater choice between
buying 1st, 2nd or 3rd class tickets than either the upper or lower
classes: the first are unable (or unlikely) to do so as it’s a strategy
anathema to the preservation of social status among the elite, and the
latter are unable to do so due to material constraints (i.e., low
incomes).
It is for these reasons that I wish to look at the distributions of
fares before proceeding with the analysis of Pclass. I’ll then return to
variable Fare at a later stage.
Analysis of Variable
Fare
# Mean fare per ticket class, then boxplots comparing adjacent classes.
cat('\nAverage Fare per Class\n')
tapply(titanic$Fare, titanic$Pclass, mean)
titanic %>%
filter(Pclass %in% c(1,2)) %>%
ggplot(aes(as.factor(Pclass),Fare)) + # boxplot requires factor
geom_boxplot() +
labs(x = 'Pclass',
title='Boxplot comparison of fares for 1st and 2nd Class') +
theme_bw() -> temp1
titanic %>%
filter(Pclass %in% c(2,3)) %>%
ggplot(aes(as.factor(Pclass),Fare)) +
geom_boxplot() +
labs(x = 'Pclass',
title='Boxplot comparison of fares for 2nd and 3rd Class') +
theme_bw() -> temp2
titanic %>% # Inspecting the lowest 25% of observations for 1st Class fares. (Will not print)
filter(Pclass == 1) %>%
filter(Fare <= quantile(Fare, 0.25)) %>%
view()
titanic %>% # Inspecting the IQR for 2nd class fares. (Will not print)
filter(Pclass == 2) %>%
filter(Fare > quantile(Fare, 0.25) & Fare <= quantile(Fare, 0.75)) %>%
view()
temp1 + temp2 # patchwork: arranges the two boxplots side by side
##
## Average Fare per Class
## 1 2 3
## 84.15469 20.66218 13.67555
(Note: the long decimals in the values of variable Fare are caused by
converting shillings and pennies to decimals).
The first plot shows there isn’t a lot of overlap between 1st class and
2nd class ticket prices. Although the lowest 25% of cases in the
distribution of 1st class fares go all the way to 0, further inspection
reveals exactly 5 cases of 1st class passengers who did not pay a fare,
and exactly one unusual value of 5. Excluding these 6 cases, the lowest
fare in upper class is 25.5875, which is about the same as the value
found at the third quartile in the distribution of fares for Pclass 2 (a
value of 26).
The Boxplot of fares for Pclass 1 also reveals the presence of outliers.
There are three cases with fares of 512.3292, which is nearly double the
second highest fare of 263.
The second plot shows some overlap between the distribution of fares for
Pclass 2 and Pclass 3. The median of Pclass 2 is slightly lower than the
Q3 of Pclass 3. The medians of both categories are also very close to
their respective Q1s, implying there isn’t a significant difference in
fare between the observation at the 25th percentile, and the observation
at the 50th percentile.
However, there are also quite a few outliers in Pclass 3 which are well
above the Q3 of Pclass 2. I want to know why are some tickets in Pclass
3 so expensive, and also what the typical ticket prices were for Pclass
2 and 3.
Meaningless
continuity
# Highest 20 fares among 3rd-class passengers, keeping family-size
# (SibSp, Parch) and Ticket columns to help interpret group fares.
cat('\nTop 20 fares in Pclass 3\n')
titanic %>%
filter(Pclass == 3) %>%
select(Survived, Name, Sex, Age, SibSp, Parch, Ticket, Fare) %>%
arrange(desc(Fare)) %>%
head(20) %>% # top 20 results of Fare
more_tib(4) # Shows more digits in variable Fare (See Custom Functions)
##
## Top 20 fares in Pclass 3
## # A tibble: 20 × 8
## Survived Name Sex Age SibSp Parch Ticket Fare
## <int> <chr> <fct> <dbl> <int> <int> <chr> <dbl>
## 1 0 "Sage, Master. Thomas Henry" male NA 8 2 CA. 2… 69.55
## 2 0 "Sage, Miss. Constance Gladys" fema… NA 8 2 CA. 2… 69.55
## 3 0 "Sage, Mr. Frederick" male NA 8 2 CA. 2… 69.55
## 4 0 "Sage, Mr. George John Jr" male NA 8 2 CA. 2… 69.55
## 5 0 "Sage, Miss. Stella Anna" fema… NA 8 2 CA. 2… 69.55
## 6 0 "Sage, Mr. Douglas Bullen" male NA 8 2 CA. 2… 69.55
## 7 0 "Sage, Miss. Dorothy Edith \"D… fema… NA 8 2 CA. 2… 69.55
## 8 1 "Bing, Mr. Lee" male 32 0 0 1601 56.50
## 9 0 "Ling, Mr. Lee" male 28 0 0 1601 56.50
## 10 1 "Lang, Mr. Fang" male 26 0 0 1601 56.50
## 11 1 "Foo, Mr. Choong" male NA 0 0 1601 56.50
## 12 1 "Lam, Mr. Ali" male NA 0 0 1601 56.50
## 13 0 "Lam, Mr. Len" male NA 0 0 1601 56.50
## 14 1 "Chip, Mr. Chang" male 32 0 0 1601 56.50
## 15 0 "Goodwin, Master. William Fred… male 11 5 2 CA 21… 46.9
## 16 0 "Goodwin, Miss. Lillian Amy" fema… 16 5 2 CA 21… 46.9
## 17 0 "Goodwin, Master. Sidney Leona… male 1 5 2 CA 21… 46.9
## 18 0 "Goodwin, Master. Harold Victo… male 9 5 2 CA 21… 46.9
## 19 0 "Goodwin, Mrs. Frederick (Augu… fema… 43 1 6 CA 21… 46.9
## 20 0 "Goodwin, Mr. Charles Edward" male 14 5 2 CA 21… 46.9
Variable Fare is not what it seems to be, making the preceding
boxplot analysis no longer accurate.
(Note: the dataset description describes the variable simply as
“Passenger Fare”, which turns out is a half-truth at best. The lack of
clarification may have been a deliberate decision by the competition
designers).
The first and third highest fares for 3rd class passengers are,
respectively, 69.55 and 46.90. However, these are not individual ticket
prices, but group fares. Each individual of family Sage and family
Goodwin in the sample have the entire family fare applied to their row,
not the actual individual fare for each.
The second highest fare of 56.5 is also the group fare paid by several
Chinese passengers, not the price paid by each.
This makes the continuity of variable Fare meaningless, since in some
cases we get an individual fare whereas in others it’s a multiple of the
individual fare.
Worse, as I inspected it further, it seems it’s not even consistent. In
some cases passengers travelling together do seem to show individual
fares. So creating a new individual fare variable by obtaining the
average might not be as simple as calculating “Fare / (SibSp + Parch +
1)”.
(Note: all members of the Sage family and all members of the Goodwin
family in the sample died. This leads to the reasonable expectation that
the remaining family members not in the sample have a low probability of
having survived. For the moment I won’t pursue this further, but it
reveals some of the potential in the variable Ticket for groups sharing
the same group ticket, and may be more promising than either SibSp or
ParCh.)
Fare and
Nationality
# Histograms of Fare per class, plus the most common fare values, to
# locate the typical individual ticket price in classes 2 and 3.
titanic %>%
filter(Pclass==1) %>%
ggplot(aes(Fare)) +
geom_histogram() +
theme_bw() -> p1
titanic %>%
filter(Pclass==2) %>%
ggplot(aes(Fare)) +
geom_histogram() +
theme_bw() -> p2
titanic %>%
filter(Pclass==3) %>%
ggplot(aes(Fare)) +
geom_histogram() +
theme_bw() -> p3
# table() counts each distinct fare; sorting descending surfaces the modes
cat('\nTop 10 most common fares for Pclass 2')
sort(table(titanic$Fare[titanic$Pclass==2]), decreasing = TRUE)[1:10]
cat('\nTop 10 most common fares for Pclass 3')
sort(table(titanic$Fare[titanic$Pclass==3]), decreasing = TRUE)[1:10]
titanic %>% # Inspect fares between 7 and 8.05. (Will not print)
filter(Fare >= 7 & Fare <=8.05) %>%
view()
p1 + p2 + p3 # patchwork: arranges the three histograms side by side
##
## Top 10 most common fares for Pclass 2
## 13 26 10.5 0 21 26.25 73.5 11.5 13.5 23
## 42 29 24 6 6 6 5 4 4 4
##
## Top 10 most common fares for Pclass 3
## 8.05 7.8958 7.75 7.925 7.775 7.2292 7.25 7.8542 8.6625 7.225
## 43 38 34 18 16 15 13 13 13 12
Even though the distribution of fares can’t be relied upon, I still want
to know the most common fares for Pclass 2 and 3, since I expect these
will reflect the true individual fares.
The histograms do not provide meaningful distributions, but the peaks
make it clear where the most common fares are located. Printing the data
of interest reveals a fare of 13 in the case of Pclass 2, and between 7
and roughly 8 in Pclass 3. The most common fare in Pclass 2 is about
62.5% to 75% higher than the most common fares in Pclass 3.
These numbers are reliable because both the fare of 13 and the range of
7 to 8.05 have, overwhelmingly, a value of 0 in both SibSp and Parch,
meaning these are individual fares. (this is not the case, for instance,
for the second highest value of Pclass 2, 26, where most seem to be the
group fare for groups of two people).
But why is there such a wide range of values in fares of Pclass 3? At
surface level that doesn’t make lot of sense. What I found is that there
seems to be a clear association between the value of the Fare and the
nationality/ethnicity of the passenger:
# Passengers who paid exactly 7.8958 (a common 3rd-class fare).
# NOTE(review): `==` on a double is fragile in general; it works here
# because the CSV value and the literal parse to the same double, but
# consider dplyr::near() if Fare is ever transformed.
cat('\nNames of passengers who paid exactly 7.8958\n')
titanic %>%
filter(Fare == 7.8958) %>%
select(Name, Fare) %>%
head(15) %>%
more_tib(5) # Show all digits of variable Fare (Custom Function)
##
## Names of passengers who paid exactly 7.8958
## # A tibble: 15 × 2
## Name Fare
## <chr> <dbl>
## 1 "Todoroff, Mr. Lalio" 7.8958
## 2 "Kraeff, Mr. Theodor" 7.8958
## 3 "Staneff, Mr. Ivan" 7.8958
## 4 "Petranec, Miss. Matilda" 7.8958
## 5 "Petroff, Mr. Pastcho (\"Pentcho\")" 7.8958
## 6 "Mionoff, Mr. Stoytcho" 7.8958
## 7 "Rekic, Mr. Tido" 7.8958
## 8 "Drazenoic, Mr. Jozef" 7.8958
## 9 "Turcin, Mr. Stjepan" 7.8958
## 10 "Nenkoff, Mr. Christo" 7.8958
## 11 "Naidenoff, Mr. Penko" 7.8958
## 12 "Mineff, Mr. Ivan" 7.8958
## 13 "Hendekovic, Mr. Ignjac" 7.8958
## 14 "Danoff, Mr. Yoto" 7.8958
## 15 "Denkoff, Mr. Mitto" 7.8958
As mentioned at the beginning of Part 1, the correlation matrix points
towards a substantial correlation between Pclass and Survived, as well
as with Fare and Age. I have sufficiently addressed variable Fare for
the moment, and I’ll look into the relationship between Pclass and Age
later on.
I’ll now add Pclass into the regression model. But first I’d like to
check if the assumption of linearity holds. I’ll check this by adding an
interaction term of Pclass against the log of itself. A significant
interaction term of this sort means there’s evidence of significant
curvature and the assumption of linearity is violated (for there to be
linearity the change in odds for each change in the value of Pclass
should be sufficiently constant.)
# Test of linearity: interaction between Pclass*log(Pclass)
# (Box-Tidwell-style check: a significant x*log(x) term is evidence the
# log-odds are not linear in Pclass.)
genderclass.int.tbl <-
titanic %>%
select(PassengerId, Survived, Sex_F, Pclass) %>%
mutate(log.Pclass.Int = log(Pclass)*Pclass) # interaction term as new variable
testmodel <- glm(Survived ~ Sex_F + Pclass + log.Pclass.Int, data = genderclass.int.tbl, family = binomial())
summary(testmodel)
##
## Call:
## glm(formula = Survived ~ Sex_F + Pclass + log.Pclass.Int, family = binomial(),
## data = genderclass.int.tbl)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1151 1.4019 -0.082 0.935
## Sex_F 2.6419 0.1841 14.351 <2e-16 ***
## Pclass -0.2297 1.3208 -0.174 0.862
## log.Pclass.Int -0.4388 0.7906 -0.555 0.579
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.66 on 890 degrees of freedom
## Residual deviance: 826.89 on 887 degrees of freedom
## AIC: 834.89
##
## Number of Fisher Scoring iterations: 4
The row of interest above is log.Pclass.Int, which is not significant
(0.579), meaning there’s no evidence of significant curvature.
Alternatively I could have run Pclass as an ordered factor:
# Alternative check: an ordered factor yields polynomial contrasts
# (.L = linear, .Q = quadratic) for Pclass.
# NOTE(review): prefer TRUE over the reassignable shorthand T; kept as-is
# because the printed output below echoes this exact call.
testmodel <- glm(Survived ~ Sex_F + factor(Pclass, ordered = T), data = titanic, family = binomial())
summary(testmodel)
##
## Call:
## glm(formula = Survived ~ Sex_F + factor(Pclass, ordered = T),
## family = binomial(), data = titanic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.25923 0.11400 -11.046 <2e-16 ***
## Sex_F 2.64188 0.18410 14.351 <2e-16 ***
## factor(Pclass, ordered = T).L -1.34739 0.15142 -8.898 <2e-16 ***
## factor(Pclass, ordered = T).Q -0.09373 0.16889 -0.555 0.579
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.66 on 890 degrees of freedom
## Residual deviance: 826.89 on 887 degrees of freedom
## AIC: 834.89
##
## Number of Fisher Scoring iterations: 4
But from what I understand, the “.L” coefficient doesn’t actually test
for linearity in the log-odds of Y given Pclass, only for a monotonic
relationship in the odds. A statistically significant monotonic trend
doesn’t exclude the possibility of non-linearity, so the first approach
should be more reliable.
Given the assumption is verified, I can proceed:
cat('\n Adding Pclass to the model\n')
# Pclass entered as a numeric (linear-in-the-logit) predictor.
genderclass.model <- glm(Survived ~ Sex_F + Pclass, data = titanic, family = binomial())
summary(genderclass.model)
cat('\n')
# NOTE(review): anova() without test = "Chisq" reports the deviance drop
# but no p-value; the text below compares 90.6 against the chi-square
# critical value manually instead.
anova(gender.model,genderclass.model)
##
## Adding Pclass to the model
##
## Call:
## glm(formula = Survived ~ Sex_F + Pclass, family = binomial(),
## data = titanic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6512 0.2410 2.703 0.00688 **
## Sex_F 2.6434 0.1838 14.380 < 2e-16 ***
## Pclass -0.9606 0.1061 -9.057 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 827.2 on 888 degrees of freedom
## AIC: 833.2
##
## Number of Fisher Scoring iterations: 4
##
##
## Analysis of Deviance Table
##
## Model 1: Survived ~ Sex_F
## Model 2: Survived ~ Sex_F + Pclass
## Resid. Df Resid. Dev Df Deviance
## 1 889 917.8
## 2 888 827.2 1 90.608
Adding Pclass to the logistic model caused the residual deviance to drop
from 917.8 to 827.2, and the AIC to also drop from 921.8 to 833.2.
917.8 - 827.2 = 90.6. Any chi-square statistic above 6.63 is statistically
significant at the 0.01 level for a chi-square distribution with 1 degree of
freedom. This means the addition of Pclass significantly improved the
model.
As before, we can convert the log odds to obtain the fitted
probabilities. Since we’re no longer dealing with a single binary
predictor, the fitted probabilities of survival will no longer precisely
overlap with the sample proportions of survival, but they need to be
sufficiently close for the model to be representative of the data and
useful in inferring survivability for cases outside the sample.
# Convert model log-odds to fitted probabilities for each Sex x Pclass cell.
# Coefficients are hard-coded from the summary above (intercept 0.6512,
# Sex_F 2.6434, Pclass -0.9606); log.regression() is a custom helper defined
# in the 'Custom Functions' section.
cat('Estimated probabilities of survival:\n\n')
cat('Men 1st class')
log.regression(0.6512, -0.9606*1)
cat('Women 1st class')
log.regression(0.6512, 2.6434 + (-0.9606*1))
cat('\n')
cat('Men 2nd class')
log.regression(0.6512, -0.9606*2)
cat('Women 2nd class')
log.regression(0.6512, 2.6434 + (-0.9606*2))
cat('\n')
cat('Men 3rd class')
log.regression(0.6512, -0.9606*3)
cat('Women 3rd class')
log.regression(0.6512, 2.6434 + (-0.9606*3))
cat('\n')
# Sample survival proportions for the same cells, for comparison with the
# fitted probabilities printed above (wide format: one column per Pclass).
cat('\nProportion of Survivals in the Sample\n')
survival_tbl <- titanic %>%
group_by(Sex, Pclass) %>%
summarise(Survivals = round(mean(Survived), 3)) %>%
pivot_wider(names_from = Pclass, values_from = Survivals)
as.data.table(survival_tbl) # data.table prints less unnecessary metadata
## Estimated probabilities of survival:
##
## Men 1st class[1] 0.4232612
## Women 1st class[1] 0.911654
##
## Men 2nd class[1] 0.2192573
## Women 2nd class[1] 0.7979289
##
## Men 3rd class[1] 0.09703606
## Women 3rd class[1] 0.6017591
##
##
## Proportion of Survivals in the Sample
## Sex 1 2 3
## 1: female 0.968 0.921 0.500
## 2: male 0.369 0.157 0.135
Despite the significance of the coefficients, and the reduction in the
residual deviance, the model is not appropriately reflecting the data.
The linear odds for Pclass as a whole are not appropriate when
considered for each Sex. We saw that 1st and 2nd class women in the
sample had survivability rates above 90%, with a sudden drop to 50% for
3rd class women. And that men travelling 1st class had survivability
rates well above the men from 2nd and 3rd class.
Applying the Pclass coefficient for each category of Sex, with no
consideration for the interaction between Pclass and Sex, causes the
predicted survivabilities in the model to be much smoother than the
sample suggests. Below is the full list of discrepancies between the
fitted value and the sample value:
1st Class Women: -5.7%
2nd Class Women: -12.3%
3rd Class Women: +10.2%
1st Class Men: +5.4%
2nd Class Men: +6.2%
3rd Class Men: -3.8%
The discrepancies are too high, especially for women travelling 2nd and
3rd Class.
To better reflect the data, the simplest approach might be to add an
interaction term. If that’s still not good enough, I might try to create
two new binary variables, one separating 3rd Class women from the rest,
and another separating 1st Class men from the rest.
# Add a Sex x Pclass interaction to capture the non-parallel class effect
# seen in the sample proportions.
# NOTE(review): Sex_F*Pclass already expands to the two main effects plus the
# interaction, so listing Sex_F and Pclass separately is redundant (harmless:
# glm drops the duplicated terms).
cat('\nNew model with Interaction Term Sex*Pclass\n')
genderclass.int.model <- glm(Survived ~ Sex_F + Pclass + Sex_F*Pclass, data = titanic, family = binomial())
summary(genderclass.int.model)
cat('\n')
# Likelihood-ratio comparison with the main-effects-only model
anova(genderclass.model, genderclass.int.model)
##
## New model with Interaction Term Sex*Pclass
##
## Call:
## glm(formula = Survived ~ Sex_F + Pclass + Sex_F * Pclass, family = binomial(),
## data = titanic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0077 0.2868 -0.027 0.979
## Sex_F 6.0493 0.8756 6.909 4.88e-12 ***
## Pclass -0.6419 0.1246 -5.152 2.58e-07 ***
## Sex_F:Pclass -1.3593 0.3202 -4.245 2.19e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.66 on 890 degrees of freedom
## Residual deviance: 803.12 on 887 degrees of freedom
## AIC: 811.12
##
## Number of Fisher Scoring iterations: 6
##
##
## Analysis of Deviance Table
##
## Model 1: Survived ~ Sex_F + Pclass
## Model 2: Survived ~ Sex_F + Pclass + Sex_F * Pclass
## Resid. Df Resid. Dev Df Deviance
## 1 888 827.20
## 2 887 803.12 1 24.075
Adding an interaction term to the model shows a significant interaction
between Sex and Pclass. We could already tell from observing the
survivability tables of Pclass for each Sex that this was the case. The
differences were too large for the possibility of them having occurred
by chance (i.e., the null hypothesis) to be a reasonable hypothesis. The
test proves it.
Furthermore, the analysis of deviance shows that adding the interaction
term is a significant improvement over the model without the
interaction, with a chi-square of 24.075.
As with the previous model, I want to actually look at the fitted
probabilities. We can expect them to be better than the previous one,
but might still not be good enough:
# Fitted probability from a logistic regression with one interaction effect.
#
# b0:  intercept (log-odds scale)
# b1:  coefficient of the first predictor, whose value is x1
# b2:  coefficient of the second predictor, whose value is x2
# Int: coefficient of the x1:x2 interaction term
#
# Returns P(Y = 1) = inverse-logit of the linear predictor.
log.regression_int <- function(b0, b1, x1, b2, x2, Int) {
  # Linear predictor on the log-odds scale, including the interaction
  logit <- b0 + (b1 * x1) + (b2 * x2) + (Int * x1 * x2)
  # Inverse-logit transform back to a probability
  1 / (1 + exp(-logit))
}
# Fitted probabilities for each Sex x Pclass cell under the interaction model.
# Coefficients hard-coded from the summary above; x1 = Sex_F (0 = men,
# 1 = women), x2 = Pclass (1-3).
cat('Estimated probabilities of survival:\n\n')
cat('Men 1st Class')
log.regression_int(-0.0077,6.0493,0,-0.6419,1,-1.3593)
cat('Women 1st Class')
log.regression_int(-0.0077,6.0493,1,-0.6419,1,-1.3593)
cat('\n')
cat('Men 2nd Class')
log.regression_int(-0.0077,6.0493,0,-0.6419,2,-1.3593)
cat('Women 2nd Class')
log.regression_int(-0.0077,6.0493,1,-0.6419,2,-1.3593)
cat('\n')
cat('Men 3rd Class')
log.regression_int(-0.0077,6.0493,0,-0.6419,3,-1.3593)
cat('Women 3rd Class')
log.regression_int(-0.0077,6.0493,1,-0.6419,3,-1.3593)
## Estimated probabilities of survival:
##
## Men 1st Class[1] 0.3430797
## Women 1st Class[1] 0.9827136
##
## Men 2nd Class[1] 0.215599
## Women 2nd Class[1] 0.8848518
##
## Men 3rd Class[1] 0.1263747
## Women 3rd Class[1] 0.5094989
Overall I am content with the fit of the model at this stage. Next I’ll
look into diagnostic data to see if any additional information can be
revealed:
# Creating new tibble with diagnostic data (See Custom Functions).
# model_tib() creates genderclass.int.model_tib in the workspace, keeping the
# listed columns plus fitted values and residual diagnostics.
model_tib(genderclass.int.model, titanic, PassengerId, Survived, Sex_F, Pclass)
# Subsetting and Counting Studentized Deviance Residuals greater than 1.96.
# With only 6 distinct predictor cells, residuals take few distinct values,
# so grouping by the residual itself counts cases per cell.
genderclass.int.model_tib %>%
filter(abs(studentized.res) > 1.96) %>%
group_by(studentized.res) %>%
summarise(count = n()) %>%
as.data.table()
# New Survival Category with "Did Not Survive" category, for plotting purposes.
# Survivors are labelled with their cell's fitted probability (rounded).
genderclass.int.model_tib2 <- genderclass.int.model_tib %>%
mutate(SurvivalCategory = factor(ifelse(Survived == 0, "Did Not Survive", as.character(round(predicted.prob, 2))),
levels = c("Did Not Survive", # manually rearranging ordering.
"0.98", "0.88", "0.51", "0.34", "0.22", "0.13")))
# Survival and Non-Survival Frequencies for each Categorical Pairing.
# NOTE(review): the labels vector maps positionally onto the factor levels
# declared above (0.98 = Women 1st Class, ..., 0.13 = Men 3rd Class) — if the
# fitted probabilities change, both the levels and the labels must be updated.
genderclass.int.model_tib2 %>%
ggplot(aes(x = factor(round(predicted.prob, 2)), fill = SurvivalCategory)) +
geom_bar() +
labs(x = "Predicted Probabilities of each Categorical Pairing",
y = "Frequencies",
title = "Survival and Non-Survival Frequencies for each Categorical Pairing",
fill = "Legend") +
theme_bw() +
scale_fill_manual(values = c("Did Not Survive" = "grey30",
"0.13" = "cadetblue4",
"0.22" = "cadetblue3",
"0.34" = "cadetblue2",
"0.51" = "firebrick2",
"0.88" = "firebrick",
"0.98" = "firebrick4"),
labels = c("Did not Survive",
"Women 1st Class",
"Women 2nd Class",
"Women 3rd Class",
"Men 1st Class",
"Men 2nd Class",
"Men 3rd Class"))
## studentized.res count
## 1: -2.898002 3
## 2: -2.092894 6
## 3: 2.038359 47
Correlation
Matrix
# Rows 9-11 of the precomputed correlation_matrix are the Embarked dummies;
# dropped columns are redundant for this comparison.
cat('\n Correlation Matrix for each category of Embarked\n\n')
correlation_matrix[c(9,10,11),c(-2,-10,-11,-12)] %>% # excluding superfluous cols
as.data.table()
##
## Correlation Matrix for each category of Embarked
##
## term Survived Pclass Age SibSp Parch
## 1: Embark_Q 0.003650383 0.22100892 -0.02240479 -0.02635373 -0.08122810
## 2: Embark_C 0.168240431 -0.24329208 0.03626079 -0.05952822 -0.01106877
## 3: Embark_S -0.149682723 0.07405279 -0.02323278 0.06873359 0.06081361
## Fare Sex_F
## 1: -0.1172160 0.07411512
## 2: 0.2693347 0.08285347
## 3: -0.1621842 -0.11922375
The correlation matrix shows a possible effect on survivability from
port of embarkation, which will be worth investigating further. It also
shows a fair association with variable Pclass.
The first step is to investigate how much of the effect is just a
reflection of being associated with Pclass. How much variability is
explained by variable Embarked that is not already explained by
Pclass?
Secondly, there is no reasonable direct causality between the port at
which a passenger embarked and their survival outcome. But if an effect
remains after controlling for Pclass, then the influence from variable
‘Embarked’ will reflect an aggregate influence from other external
variables. Some of these, like Pclass, might also be in the dataset
(such as Sex), but others might not.
For instance, what if the location of the cabins for 3rd class
passengers depended on the port of embarkation? Those at the front of
the ship in the lower deck cabins likely had less time to react to the
impact. This could create an association between variables Embarked and
Survived.
In this sense, Embarked is different from the two preceding variables,
in that there is no meaningful theoretical groundings on which to argue
a causal relationship. But we’re still interested in the effect. If this
were a real study being conducted, an effect size in a variable such as
this might prompt further investigation in order to attempt to unveil
the actual underlying causes.
For my intended purposes here, however, I just need to be careful with
the interpretation.
Tabulated
Data
# Counts and survival proportions by port of embarkation, followed by a
# chi-square test of independence and per-port one-sample proportion tests
# against the overall sample survival rate (0.384).
cat('\nTotal Counts per Port of Embarkation')
embark.counts <-
table(titanic$Embarked)
embark.counts
cat('\nProportion of Survivals per Port of Embarkation\n')
embark.survivals <- round(
prop.table(table(titanic$Embarked, titanic$Survived), margin = 1), 3)
embark.survivals[,2]
cat('\nOverall Proportion of Survivals in the sample')
round(
prop.table(table(titanic$Survived)), 3)
# Full SPSS-style crosstab with expected counts and (adjusted) residuals
embark.survived.tbl <- table(titanic$Embarked, titanic$Survived)
CrossTable(embark.survived.tbl, fisher = F, chisq = T, expected = T, resid = T, sresid = T, asresid = T, format = "SPSS", digits = 2)
# x and n are the survivor counts / totals for C, Q, S from the tables above
prop.test(x = 93, n = 168, p = 0.384, correct = FALSE)
prop.test(x = 30, n = 77, p = 0.384, correct = FALSE)
prop.test(x = 219, n = 646, p = 0.384, correct = FALSE)
##
## Total Counts per Port of Embarkation
## C Q S
## 168 77 646
##
## Proportion of Survivals per Port of Embarkation
## C Q S
## 0.554 0.390 0.339
##
## Overall Proportion of Survivals in the sample
## 0 1
## 0.616 0.384
##
## Cell Contents
## |-------------------------|
## | Count |
## | Expected Values |
## | Chi-square contribution |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## | Residual |
## | Std Residual |
## | Adj Std Resid |
## |-------------------------|
##
## Total Observations in Table: 891
##
## |
## | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## C | 75 | 93 | 168 |
## | 103.52 | 64.48 | |
## | 7.86 | 12.61 | |
## | 44.64% | 55.36% | 18.86% |
## | 13.66% | 27.19% | |
## | 8.42% | 10.44% | |
## | -28.52 | 28.52 | |
## | -2.80 | 3.55 | |
## | -5.02 | 5.02 | |
## -------------|-----------|-----------|-----------|
## Q | 47 | 30 | 77 |
## | 47.44 | 29.56 | |
## | 0.00 | 0.01 | |
## | 61.04% | 38.96% | 8.64% |
## | 8.56% | 8.77% | |
## | 5.27% | 3.37% | |
## | -0.44 | 0.44 | |
## | -0.06 | 0.08 | |
## | -0.11 | 0.11 | |
## -------------|-----------|-----------|-----------|
## S | 427 | 219 | 646 |
## | 398.04 | 247.96 | |
## | 2.11 | 3.38 | |
## | 66.10% | 33.90% | 72.50% |
## | 77.78% | 64.04% | |
## | 47.92% | 24.58% | |
## | 28.96 | -28.96 | |
## | 1.45 | -1.84 | |
## | 4.47 | -4.47 | |
## -------------|-----------|-----------|-----------|
## Column Total | 549 | 342 | 891 |
## | 61.62% | 38.38% | |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 25.96445 d.f. = 2 p = 2.300863e-06
##
##
##
## Minimum expected frequency: 29.55556
##
##
## 1-sample proportions test without continuity correction
##
## data: 93 out of 168, null probability 0.384
## X-squared = 20.422, df = 1, p-value = 6.21e-06
## alternative hypothesis: true p is not equal to 0.384
## 95 percent confidence interval:
## 0.4780372 0.6267106
## sample estimates:
## p
## 0.5535714
##
##
## 1-sample proportions test without continuity correction
##
## data: 30 out of 77, null probability 0.384
## X-squared = 0.010246, df = 1, p-value = 0.9194
## alternative hypothesis: true p is not equal to 0.384
## 95 percent confidence interval:
## 0.2884225 0.5012893
## sample estimates:
## p
## 0.3896104
##
##
## 1-sample proportions test without continuity correction
##
## data: 219 out of 646, null probability 0.384
## X-squared = 5.528, df = 1, p-value = 0.01871
## alternative hypothesis: true p is not equal to 0.384
## 95 percent confidence interval:
## 0.3035530 0.3763689
## sample estimates:
## p
## 0.3390093
We may also conduct a one-sample proportion z-test to obtain the same
result. Given that squaring the standard normal distribution gives us
the chi-square distribution with one degree of freedom, the equality
\(\chi^2=Z^2\) or \(\sqrt{\chi^2}=Z\) is observed for the
chi-square and z-score statistics.
I’ll use the example above for the survivability of passengers who
embarked in Southampton to demonstrate this manually:
PROPORTIONS:
Embark_S(P(Survived=1)) = 219 / 646 = 0.339
Sample(P(Survived=1)) = 0.384
OBSERVED COUNTS:
Embark_S(\(O_{Survived=1}\)) =
219
Embark_S(\(O_{Survived=0}\)) = 646-219
= 427
EXPECTED COUNTS:
Embark_S(\(E_{Survived=1}\)) = 0.384 *
646 = 248.064
Embark_S(\(E_{Survived=0}\)) =
(1-0.384) * 646 = 397.936
So, if there were no relationship between survivability and having
embarked in Southampton, we would expect the observed values to
approximate the expected values. We already verified a statistically
significant dependency given by \(\chi^2(1)=5.528, p = 0.01871\).
Manually:
\(\chi^2_{Embarked.S} =
\frac{(219-248.064)^2}{248.064}+\frac{(427-397.936)^2}{397.936}=3.405+2.123=\textbf{5.528}\)
Taking the square root, \(\sqrt{5.528} =
\textbf{2.35117}\)
This value of about 2.35 is the size of the z-statistic we will obtain
if we conduct a one sample proportion z-test to check if the subset
proportion is significantly different from the sample proportion:
\(\Large Z=\frac{p_{subset} -
p_{null}}{\sqrt{\frac{p_{null}*(1-p_{null})}{n_{subset}}}}=\frac{0.339 -
0.384}{\sqrt{\frac{0.384*(1-0.384)}{646}}}\Large\)\(=\frac{-0.045}{0.01914}=\textbf{-2.3511}\)
\(Z^2=(-2.3511)^2=\textbf{5.528}\)
And the obtained p-values are the same:
# observed counts and expected proportions from the sample:
# 219 survivors / 427 non-survivors among Southampton passengers, tested
# against the overall survival rate of 0.384.
observed <- c(219,427)
proportions <- c(0.384, 1 - 0.384)
chisq.test(x = observed, p = proportions)
# Two-tailed p-value from the z-statistic (2.3511) derived manually above;
# should match the chi-square p-value since Z^2 = chi^2 with 1 df.
cat('\nOne-Sample Proportion Z test (Two-tailed)\n')
cat('p-value = ',
2*pnorm(2.3511, lower.tail = F))
##
## Chi-squared test for given probabilities
##
## data: observed
## X-squared = 5.528, df = 1, p-value = 0.01871
##
##
## One-Sample Proportion Z test (Two-tailed)
## p-value = 0.018718
# Pearson correlations of the Embarked dummies (rows 9-11) with Survived,
# Pclass and Sex_F, followed by partial correlations controlling for Pclass,
# Sex, and both. pcor() is from the ggm package: first two variables are
# correlated, remaining ones are partialled out.
cat('\nPearson Correlations for Embarked with Survived, Pclass and Sex\n')
correlation_matrix[c(9,10,11),c(1,3,4,9)] %>%
as.data.table()
## PARTIAL CORRELATION
temp <- titanic %>% # getting errors from not coding as numeric
mutate(Survived = as.numeric(Survived),
Pclass = as.numeric(Pclass)) %>%
as.data.table()
cat('\nPartial Correlations for Embarked x Survived, Controlled for Pclass\n')
cat('Embark_Q:',pcor(c('Embark_Q', 'Survived', 'Pclass'), var(temp)))
cat('\nEmbark_C:',pcor(c('Embark_C', 'Survived', 'Pclass'), var(temp)))
cat('\nEmbark_S:',pcor(c('Embark_S', 'Survived', 'Pclass'), var(temp)))
cat('\n')
cat('\nPartial Correlations for Embarked x Survived, Controlled for Sex\n')
cat('Embark_Q:',pcor(c('Embark_Q', 'Survived', 'Sex_F'), var(temp)))
cat('\nEmbark_C:',pcor(c('Embark_C', 'Survived', 'Sex_F'), var(temp)))
cat('\nEmbark_S:',pcor(c('Embark_S', 'Survived', 'Sex_F'), var(temp)))
cat('\n')
cat('\nPartial Correlations for Embarked x Survived, Controlled for Pclass & Sex\n')
cat('Embark_Q:',pcor(c('Embark_Q', 'Survived', 'Pclass', 'Sex_F'), var(temp)))
cat('\nEmbark_C:',pcor(c('Embark_C', 'Survived', 'Pclass', 'Sex_F'), var(temp)))
cat('\nEmbark_S:',pcor(c('Embark_S', 'Survived', 'Pclass', 'Sex_F'), var(temp)))
##
## Pearson Correlations for Embarked with Survived, Pclass and Sex
## term Survived Pclass Sex_F
## 1: Embark_Q 0.003650383 0.22100892 0.07411512
## 2: Embark_C 0.168240431 -0.24329208 0.08285347
## 3: Embark_S -0.149682723 0.07405279 -0.11922375
##
## Partial Correlations for Embarked x Survived, Controlled for Pclass
## Embark_Q: 0.08549342
## Embark_C: 0.09410615
## Embark_S: -0.1327991
##
## Partial Correlations for Embarked x Survived, Controlled for Sex
## Embark_Q: -0.04374143
## Embark_C: 0.1472856
## Embark_S: -0.1018603
##
## Partial Correlations for Embarked x Survived, Controlled for Pclass & Sex
## Embark_Q: 0.03377889
## Embark_C: 0.0780645
## Embark_S: -0.08763114
I wasn’t sure whether adding an fpc factor to the Wilson formula was a
valid approach, but this
paper seems to do just that, and also concludes it produces reliable
estimates at both small and large sample counts.
The main issue is that I’m relying on the assumption that sample to
population proportions are equally represented across the subsets
(excluding subsets with very small counts). Since this is intended for
exploratory purposes and I’ll also be graphing a wider interval using
the exact method, the potential inaccuracy shouldn’t be a problem.
The function below adds an fpc factor to the custom function
wilson_95ci(), created in the ‘Custom Functions’ section. The function
to calculate the fpc in turn can be found in the section ‘Variable Sex:
Standard Error and the FPC factor’.
The results of wilson_95ci() are compared with the results from
binom.confint() from the binom package, to guarantee wilson_95ci() is
working correctly prior to adding the fpc factor.
The function will be used to draw confidence intervals in the following
section.
### Check "Custom Functions" for wilson_95ci()
# Comparing wilson_95ci results with binom.confint(method="wilson")
cat('\nEstimated true probability given 17 successes in 42 trials\n')
test.a <- wilson_95ci(17,42)
test.b <- binom.confint(17,42, methods = "wilson")
list.a <- c(test.a[1],test.a[2],test.a[3])
list.b <- c(test.b$mean,test.b$lower,test.b$upper)
cat('\nComparing wilson_95ci results with binom.confint()\n')
all.equal(unname(list.a),list.b)
### WILSON SCORE CONFIDENCE INTERVAL with FPC FACTOR
# Wilson score interval for a binomial proportion, with an optional finite
# population correction applied to the standard error.
#
# successes: number of observed successes.
# trials:    number of trials (must be > 0).
# fpc:       finite population correction factor multiplied into the SE
#            (default 1 = no correction, i.e. plain Wilson interval).
# z:         standard-normal critical value (default 1.96 for a 95% CI;
#            generalized from the previously hard-coded constant).
#
# Returns a named numeric vector: Mean (sample proportion), Lower, Upper.
wilson_95ci.fpc <- function(successes, trials, fpc = 1, z = 1.96) {
prop <- successes/trials
# Standard Error term of the Wilson interval, scaled by the fpc
se <- sqrt((prop*(1-prop)/trials) + (z^2)/(4*(trials^2)))*fpc
# Interval centre: proportion shrunk towards 0.5 (small-sample adjustment)
prop.adj <- prop + (z^2)/(2*trials)
# Wilson bounds: shifted centre +/- z*se, rescaled by the common denominator
Lower.CI <- (prop.adj - z*se) / (1 + (z^2)/trials)
Upper.CI <- (prop.adj + z*se) / (1 + (z^2)/trials)
results <- c(Mean = prop, Lower = Lower.CI, Upper = Upper.CI)
return(results)
}
##
## Estimated true probability given 17 successes in 42 trials
## Mean Lower Upper
## 0.4047619 0.2704265 0.5550595
##
## Comparing wilson_95ci results with binom.confint()
## [1] "Mean relative difference: 5.823213e-06"
Grouped
Distributions
# Summarize Counts and Survivability by Sex, Pclass, and Embarked
embark.summary <- titanic %>%
# Summarise counts by the 18 grouped categories
group_by(Sex, Pclass, Embarked) %>%
summarise(Count = n(),
Survivability = round(mean(Survived),3),
.groups = "drop")
# Grouped summary, Long format (Sex and Pclass collapsed into "female.1" etc.)
embark.tbl.long <- embark.summary %>%
unite("Sex_Pclass", Sex, Pclass, sep = ".")
## Table with subgroup sample proportions
embark.tbl.wide <- embark.tbl.long %>%
select(-Count) %>%
pivot_wider(names_from = Sex_Pclass, values_from = Survivability)
cat('\nProportion of Survivals\n')
as.data.table(embark.tbl.wide)
### TABLE WITH EXACT AND WILSON 95% CI ESTIMATES
## Create new table with Successes and empty columns
embark.tbl.full <- embark.tbl.long %>%
# Unite Variables Embarked, Sex and Pclass into single column
unite("Emb_Sex_Cl", Embarked, Sex_Pclass, sep = ".") %>%
# Create Successes column and empty columns for the CI and p-value.
# Successes is recovered from the rounded proportion, so it is exact only
# to the rounding used above (3 decimals).
mutate(Successes = round(Count*Survivability,0),
Binom_Low = NA,
Binom_Up = NA,
Binom_P = NA,
Wil95CI_Low = NA,
Wil95CI_Up = NA) %>%
# Rearrange column order
select(Emb_Sex_Cl, Successes, everything())
# .[,c(1,4,2,3,5:7)] @found simpler solution above
## Fill Columns with results from the exact and wilson+fpc methods.
# NOTE(review): 1:nrow(x) misbehaves if the table ever has zero rows;
# seq_len(nrow(x)) would be the safe idiom.
for (i in 1:nrow(embark.tbl.full)) {
# binomial test (exact CI; p-value vs null p = 0.5 is incidental here)
results_bin <- binom.test(embark.tbl.full$Successes[i],
embark.tbl.full$Count[i],
p = 0.5)
# wilson+fpc
results_wil <- wilson_95ci.fpc(embark.tbl.full$Successes[i],
embark.tbl.full$Count[i],
titanic.fpc) # obtained w/ fpc custom func
# Store the resulting CI and p-values
embark.tbl.full$Binom_Low[i] <- round(results_bin$conf.int[1],3)
embark.tbl.full$Binom_Up[i] <- round(results_bin$conf.int[2],3)
embark.tbl.full$Binom_P[i] <- round(results_bin$p.value,3)
# Wilson+fpc bounds (elements 2 and 3 of the returned vector)
embark.tbl.full$Wil95CI_Low[i] <- round(results_wil[2],3)
embark.tbl.full$Wil95CI_Up[i] <- round(results_wil[3],3)
}
### HEAT MAP
## Plotting Survivability and Counts by Sex, Pclass, and Embarked
embark.tbl.long %>%
# Exclude estimates with low counts (shown as grey tiles via na.value)
mutate(Survivability = ifelse(Count < 3, NA, Survivability)) %>%
# Plot tilemap
ggplot(aes(x = Sex_Pclass, y = Embarked, fill = Survivability)) +
geom_tile(color = 'white') +
scale_fill_gradient(na.value = "grey80") +
labs(title = "Survivability & Counts by Sex, Pclass, and Embarked",
x = "Categorical Pairings of Sex and Pclass",
y = "Embarked",
fill = "Survivability") +
# Overlay Counts
geom_text(aes(label = Count), color = "white", size = 4) -> groups.hmap
### PLOT CONFIDENCE INTERVALS
# NOTE(review): `size` for lines is deprecated since ggplot2 3.4 in favour of
# `linewidth` — works but emits a warning on newer versions.
embark.tbl.full %>%
# Exclude Low Count Categories (Greyed Out)
mutate(highlight_bi = ifelse(Count < 3, "exclude", "exact"),
highlight_wil =ifelse(Count < 3, "exclude", "wilson"),
highlight_pt =ifelse(Count < 3, "exclude", "prop")) %>%
# Order by Survivability
mutate(Emb_Sex_Cl = fct_reorder(Emb_Sex_Cl, Survivability)) %>%
# Plot Exact CI Intervals (wider bars, drawn underneath)
ggplot(aes(x = Emb_Sex_Cl, y = Survivability)) +
geom_errorbar(aes(ymin = Binom_Low, ymax = Binom_Up,color = highlight_bi), width = 0.8, size = 1) +
# Overlap Wilson FPC intervals (narrower bars, drawn on top)
geom_errorbar(aes(ymin = Wil95CI_Low, ymax = Wil95CI_Up,color = highlight_wil), width = 0.6, size = 1) +
# Points for the sample proportions
geom_point(aes(color = highlight_pt),size = 2) +
# Labels and Theme
labs(x = "Category", y = "Proportion", title = "Estimated Survivability for each Subgroup") +
#scale_color_identity() +
#theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
coord_flip() +
# Legend
scale_color_manual(values = c("exact" = "firebrick3",
"wilson" = "green4",
"prop" = "grey10",
"exclude" = "grey85"),
# Set Label Order
breaks = c("exact", "wilson", "prop", "exclude"),
# Name Labels
labels = c("Exact", "Wilson+FPC", "Sample Proportion", "Low Count"),
name = "95% Confidence Intervals") +
theme_bw() +
theme(legend.position = c(0.82, 0.32),
legend.background = element_rect(color = "black")) -> groups.cis
# Print both stored plots
groups.hmap
groups.cis
##
## Proportion of Survivals
## Embarked female.1 female.2 female.3 male.1 male.2 male.3
## 1: C 0.977 1.00 0.652 0.405 0.200 0.233
## 2: Q 1.000 1.00 0.727 0.000 0.000 0.077
## 3: S 0.960 0.91 0.375 0.354 0.155 0.128
Main considerations:
1. 3rd class women who embarked in Queenstown were much more likely to
have survived than other 3rd class women, and this was a statistically
significant difference.
2. Passengers who embarked in Cherbourg had higher proportions of
survivals across all categories when compared with Southampton. Even
though the individual confidence intervals may not be significantly
different, the trend is consistent across all ‘C’ subgroups.
# Model: interaction model extended with Embarked dummies. Sex_F*Embark_Q is
# motivated by the 3rd-class Queenstown women observed above; Embark_C enters
# as a main effect only. Embark_S is the omitted reference category.
gen.cl.emb.model <- glm(Survived ~
Sex_F +
Pclass +
Sex_F*Pclass +
Sex_F*Embark_Q +
Embark_C,
data = titanic, family = binomial())
cat('\nADDING EMBARKED TO THE MODEL\n')
summary(gen.cl.emb.model)
cat('\n')
# Deviance comparison against the Sex x Pclass interaction model
anova(genderclass.int.model,gen.cl.emb.model)
# Manual p-value for the deviance drop of 18.02 on 3 df
cat('\n\nChi-Square test (chi: 18.02, df:3)\n')
cat('p =',1 - pchisq(18.02, 3))
# Create new tibble with fitted values and diagnostic data
model_tib(gen.cl.emb.model, titanic, PassengerId, Survived, Sex, Pclass,Embarked)
# Summarise counts by the 18 grouped categories
gen.cl.emb.model_summary <- gen.cl.emb.model_tib %>%
group_by(Sex, Pclass, Embarked, round(predicted.prob,3)) %>%
summarise(Sample_Prop = round(mean(Survived),3),
.groups = "drop") %>%
# Collapse Embarked,Sex and Pclass into a single variable
unite("Emb_Sex_Cl", Embarked, Sex, Pclass, sep = ".") %>%
rename(Pred_Prob = 'round(predicted.prob, 3)')
#Convert to Long Format for Side-by-Side Barplot
gen.cl.emb.model_long <- gen.cl.emb.model_summary %>%
pivot_longer(cols = c(Pred_Prob, Sample_Prop),
names_to = "Type", values_to = "Survivability")
# Create the barplot
gen.cl.emb.model_long %>%
# Reorder Emb_Sex_Cl in descending order of Survivability
mutate(Emb_Sex_Cl = fct_reorder(Emb_Sex_Cl, -Survivability)) %>%
# Plot
ggplot(aes(x = Emb_Sex_Cl, y = Survivability, fill = Type)) +
# Side-by-side barplots with 'dodge'
geom_bar(stat = "identity", position = "dodge", width = 0.7) +
# Theme and Labels.
# NOTE(review): this theme() call is overridden by theme_bw() below; the
# axis.text.x settings are re-applied in the final theme() call, which is
# why they appear twice.
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x = "Subgroups of Sex, Pclass and Embarked",
y = "Survivability",
title = 'Model Predictions vs Sample Proportions') +
scale_fill_manual(values = c("Pred_Prob" = "firebrick2",
"Sample_Prop" = "grey50"),
labels = c("Pred_Prob" = "Model",
"Sample_Prop" = "Sample")) +
theme_bw() +
theme(legend.position = c(0.9, 0.78),
legend.background = element_rect(color = "black"),
axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.3))
##
## ADDING EMBARKED TO THE MODEL
##
## Call:
## glm(formula = Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F *
## Embark_Q + Embark_C, family = binomial(), data = titanic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3050 0.3055 -0.998 0.31817
## Sex_F 6.5965 0.9233 7.144 9.04e-13 ***
## Pclass -0.5523 0.1282 -4.308 1.65e-05 ***
## Embark_Q -0.6358 0.6201 -1.025 0.30526
## Embark_C 0.6309 0.2256 2.796 0.00517 **
## Sex_F:Pclass -1.6588 0.3421 -4.849 1.24e-06 ***
## Sex_F:Embark_Q 1.9712 0.7550 2.611 0.00903 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 785.1 on 884 degrees of freedom
## AIC: 799.1
##
## Number of Fisher Scoring iterations: 6
##
##
## Analysis of Deviance Table
##
## Model 1: Survived ~ Sex_F + Pclass + Sex_F * Pclass
## Model 2: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C
## Resid. Df Resid. Dev Df Deviance
## 1 887 803.12
## 2 884 785.10 3 18.02
##
##
## Chi-Square test (chi: 18.02, df:3)
## p = 0.0004356917
The new model is a significant improvement over the previous one. A
likelihood-ratio statistic of 18.02 is significant for a chi-square
distribution with 3 degrees of freedom (p-value < 0.01).
The model is fitting the data fairly well, and we can expect the
accuracy of the model to improve somewhat given the <0.5 probability
for category S.female.3.
Applying the model to
the test dataset
Finally, I’ll generate the predicted probabilities and binary survival
outcome for the test data:
# Keep an untouched copy of the test set before recoding
test.backup <- titanic.test
# Add recoded variables to titanic.test (same coding used for training data)
titanic.test$Sex_F <- ifelse(titanic.test$Sex=="male",0,1)
titanic.test <- titanic.test %>% # Dummy variables for embarked,
mutate(Embark_Q = ifelse(Embarked == 'Q',1,0),
Embark_C = ifelse(Embarked == 'C',1,0),
Embark_S = ifelse(Embarked == 'S',1,0))
# Model Predicted Probability (response scale, i.e. probabilities not log-odds)
titanic.test <- titanic.test %>%
mutate(Pred_Prob = predict(gen.cl.emb.model, newdata = titanic.test, type = "response"))
# Predict Binary Outcome at the conventional 0.5 threshold
titanic.test$Survived <- ifelse(titanic.test$Pred_Prob > 0.5, 1, 0)
# Create File for Export (Kaggle submission format: PassengerId, Survived)
gen.cl.emb_sub <- titanic.test %>%
select(PassengerId, Survived)
# write_csv(gen.cl.emb_sub, "Kaggle/gen.cl.emb_sub.csv")
Frequency
Table
# Basic descriptives for Age; summary() also reports the NA count
cat('\nBasic Descriptives\n')
summary(titanic$Age)
cat('\n')
# Raw value-by-value frequency table (reveals fractional and half ages)
cat('\nFrequencies of each value of Age\n')
table(titanic$Age)
##
## Basic Descriptives
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.42 20.12 28.00 29.70 38.00 80.00 177
##
##
## Frequencies of each value of Age
##
## 0.42 0.67 0.75 0.83 0.92 1 2 3 4 5 6 7 8 9 10 11
## 1 1 2 2 1 7 10 6 10 4 3 3 4 8 2 4
## 12 13 14 14.5 15 16 17 18 19 20 20.5 21 22 23 23.5 24
## 1 2 6 1 5 17 13 26 25 15 1 24 27 15 1 30
## 24.5 25 26 27 28 28.5 29 30 30.5 31 32 32.5 33 34 34.5 35
## 1 23 18 18 25 2 20 25 2 17 18 2 15 15 1 18
## 36 36.5 37 38 39 40 40.5 41 42 43 44 45 45.5 46 47 48
## 22 1 6 11 14 13 2 6 13 5 9 12 2 3 9 9
## 49 50 51 52 53 54 55 55.5 56 57 58 59 60 61 62 63
## 6 10 7 6 1 8 2 1 4 2 5 2 4 3 4 2
## 64 65 66 70 70.5 71 74 80
## 2 3 1 2 1 2 1 1
Of note:
Missing Values: There are 177 missing values, which correspond to
roughly 20% of the total.
I’ll be using a multiple imputation method to handle the missing data,
but afterwards I will want to check the patterns manually. If I cannot
develop a good theoretical grounding to justify the imputed values, I’d
rather transform Age into a categorical variable and extract some useful
information that way (such as a dummy variable for the very young and
another for the very old).
Fractional values: There are 7 cases with fractional values under
1. This is of no practical advantage (it sounds bizarre to propose that
a child of 4 months had a higher survivability than another of 8 months,
and in any case there aren’t enough cases to test this). These cases
will be recoded as 0.5, to keep them all as their own category, separate
from Age 1.
On top of that, there are quite a few “half ages”, which seems an
unnecessary nuisance. Any number ending in ‘.5’ will be recoded as the
integer immediately preceding it (excluding the newly recoded ages under
1).
Descriptive
Plots
### Distribution of Total and Survival Counts by Age
# Two overlaid histograms: the grey layer is drawn from ALL rows, the
# red layer only from survivors, so the grey left visible above the red
# approximates the non-survivor count.
# NOTE(review): the grey legend entry is labelled "Did not Survive" but
# that layer actually plots TOTAL counts; the label only reads correctly
# because survivors are painted over it. The `data = titanic` argument
# in the first layer duplicates the piped data (harmless).
titanic %>%
ggplot(aes(Age)) +
geom_histogram(data = titanic, aes(fill = "Did not Survive")) +
geom_histogram(data = subset(titanic, Survived == 1), aes(fill = "Survived"), alpha = 0.8) +
scale_fill_manual(name = "Legend",
values = c("Did not Survive" = "grey30",
"Survived" = "firebrick2")) +
labs(y = "Frequencies",
title = "Distribution of Total and Survival Counts by Age") +
theme_bw() +
theme(legend.position = c(0.85, 0.80),
legend.background = element_rect(color = "black"))
### Estimate of Sample Survival Probability by Age
# LOESS smooth of the 0/1 Survived outcome over Age; the shaded ribbon
# is geom_smooth()'s default 95% confidence band.
titanic %>%
ggplot(aes(Age, Survived)) +
labs(x = "Age", y = "Survivability = P(Survived)",
title = 'Estimated Sample Survivability by Age') +
geom_smooth(method = 'loess', span = 0.8, colour='firebrick2') +
scale_x_continuous(expand = c(0, 0)) + # removes padding
coord_cartesian(ylim = c(0,1)) +
theme_bw()
# Inspecting average survivability by Age. (Will not print)
# Mean survival rate and case count per distinct Age; view() opens the
# table in the data viewer and returns its input invisibly.
age.survived <- titanic %>%
filter(!is.na(Age)) %>%
group_by(Age) %>%
summarise(Survivability = mean(Survived),
Counts = n()) %>%
view()
The histogram makes it clearer that the frequency of child passengers
and of elderly passengers is fairly low. The bulk of cases seems to fall
roughly within Ages 20 to 40 (The IQR is [20,38]).
The second plot provides an approximation of survivability in the sample
for each value of Age (the shaded area gives the 95% confidence
interval), which helps clarify the relationship.
As I hypothesized, survivability is much higher for younger ages but
falls rapidly and seems to stabilize for a while (before dropping
again).
However, unlike my initial expectation, the trend doesn’t smoothly
stabilize around the ages 14-16. Instead, it keeps dropping, falling to
its lowest survivability at around the age of 20 before going up again,
and then it doesn’t reach such a low point again until around the age of
60.
What may be causing a lower than average survivability for young
adults?
My suspicion is that this particular age bracket is highly correlated
with being a male and/or a 3rd class passenger, and so that’s what I’ll
check first when I analyse the bivariate relationships.
Finally, the very low amount of observations past age 60, and
particularly the single observation at the age of 80, is causing the
confidence interval to become very wide. Although the general trend of
lower survivability among the elderly is likely real, it’s not
discernible from the sample if this is a steep difference or a moderate
one, compared to other adults.
### TIDYING VARIABLE AGE ###
# One pass over Age (mutate() evaluates its arguments sequentially, so
# each step sees the previous step's result):
#   1. floor() truncates fractional ages: half-ages (14.5 -> 14) become
#      the preceding integer, and infant ages under 1 (0.42 etc.) become 0;
#   2. ages truncated to 0 are recoded as 0.5, keeping infants as their
#      own category, distinct from Age 1;
#   3. Age.NA dummy-codes the missing values -- floor() and ifelse()
#      both propagate NA, so is.na(Age) still flags exactly the
#      originally missing rows.
# (Referring to the bare column `Age` replaces the previous
# `is.na(titanic$Age)`, which is equivalent here but non-idiomatic
# inside mutate().)
titanic <- titanic %>%
  mutate(
    Age    = floor(Age),
    Age    = ifelse(Age == 0, 0.5, Age),
    Age.NA = as.numeric(is.na(Age))
  )
I want to look at a set of baseline predictive models of Survived ~ Age
for the 714 valid cases: a linear, an exponential and a piecewise model.
Later, when I add Age to the main model on the full dataset with imputed
values, I’ll want to check these models again.
# Subset of Data without NAs (714 complete-Age cases)
titanic.na.rm <- filter(titanic, !is.na(Age))
# Baseline logistic models of Survived as a linear / logarithmic
# function of Age
age.model.lin <- glm(Survived ~ Age, data = titanic.na.rm, family = binomial())
age.model.exp <- glm(Survived ~ log(Age), data = titanic.na.rm, family = binomial())
# Fitted survival probabilities under each model. For a glm, fitted()
# on the training data is identical to predict(type = "response").
titanic.na.rm <- titanic.na.rm %>%
  mutate(
    Lin.Age = fitted(age.model.lin),
    Exp.Age = fitted(age.model.exp)
  )
### PIECEWISE MODEL: breaking the data into three sections
# Age regimes: exponential up to age 20, linear for 21-45, linear past
# 45. case_when() replaces the original nested ifelse() calls -- it is
# the idiomatic dplyr form, evaluates conditions top-down, and yields
# identical output here since Age has no NAs in titanic.na.rm.
titanic.na.rm <- titanic.na.rm %>%
  mutate(PM.Regime = case_when(
    Age <= 20 ~ "Exp",
    Age <= 45 ~ "Lin.Mid",
    TRUE      ~ "Lin.Late"
  ))
# Modeling each section separately on its own subset
age_exp  <- glm(Survived ~ log(Age), data = filter(titanic.na.rm, PM.Regime == "Exp"), family = binomial())
age_lin1 <- glm(Survived ~ Age, data = filter(titanic.na.rm, PM.Regime == "Lin.Mid"), family = binomial())
age_lin2 <- glm(Survived ~ Age, data = filter(titanic.na.rm, PM.Regime == "Lin.Late"), family = binomial())
# Predicting each section into a new "Piecewise" variable.
# Each sub-model is evaluated on the full data set; case_when() then
# selects, per row, the prediction matching that row's regime (same
# values the nested ifelse() produced).
titanic.na.rm <- titanic.na.rm %>%
  mutate(Piecewise = case_when(
    PM.Regime == "Exp"     ~ predict(age_exp, newdata = titanic.na.rm, type = "response"),
    PM.Regime == "Lin.Mid" ~ predict(age_lin1, newdata = titanic.na.rm, type = "response"),
    TRUE                   ~ predict(age_lin2, newdata = titanic.na.rm, type = "response")
  ))
# Modelling Survived predicted by the new piecewise variable
age.piecewise <- glm(Survived ~ Piecewise, data = titanic.na.rm, family = binomial())
## DATA VS BASELINE MODELS
# Overlay the loess-smoothed data trend (dashed red) with the fitted
# curves of the three baseline models, sharing one manual colour legend.
titanic.na.rm %>%
ggplot(aes(Age, Survived)) +
labs(x = "Age", y = "Survivability",
title = "Data Trend vs Baseline Predictive Models") +
# Data Trend
geom_smooth(method = 'loess',
se = FALSE,
span = 0.8,
linetype = 'dashed',
aes(colour='Data Trend')) +
# Linear Model
geom_line(aes(y = Lin.Age, color = "Linear Model"),
size = 1) +
# Exponential Model
geom_line(aes(y = Exp.Age, color = "Exponential Model"),
size = 1) +
# Piecewise Model
geom_line(aes(y = Piecewise, color = "Piecewise Model"),
size = 1) +
# Plot Settings
scale_x_continuous(limits = c(0, 80), expand = c(0, 0)) +
coord_cartesian(ylim = c(0,1)) +
theme_bw() +
# Legend Settings
scale_color_manual(values = c("Data Trend" = "firebrick2",
"Linear Model" = "royalblue1",
"Exponential Model" = "orange",
"Piecewise Model" = "green")) +
theme(legend.position = c(0.85, 0.75),
legend.background = element_rect(color = 'black')) +
labs(color = "Model Type")
cat('\nLINEAR MODEL\n')
summary(age.model.lin)
cat('\n\n')
# NOTE(review): these three models are NOT nested (they all have the
# same residual df, hence Df = 0 in the output), so anova() only lists
# their deviances for informal comparison -- no chi-square test is
# valid here. An AIC comparison would be the cleaner justification.
anova(age.model.lin,age.model.exp,age.piecewise)
##
## LINEAR MODEL
##
## Call:
## glm(formula = Survived ~ Age, family = binomial(), data = titanic.na.rm)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.058690 0.173500 -0.338 0.7352
## Age -0.010902 0.005329 -2.046 0.0408 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 960.28 on 712 degrees of freedom
## AIC: 964.28
##
## Number of Fisher Scoring iterations: 4
##
##
##
## Analysis of Deviance Table
##
## Model 1: Survived ~ Age
## Model 2: Survived ~ log(Age)
## Model 3: Survived ~ Piecewise
## Resid. Df Resid. Dev Df Deviance
## 1 712 960.28
## 2 712 952.52 0 7.7564
## 3 712 945.40 0 7.1162
Unsurprisingly the linear model isn’t a very good fit. The exponential
model is an improvement but still not good enough. Of the three baseline
models, the piecewise model, which splits the model into an exponential
model for ages below 20, and two linear models, one for ages 21 to 45
and another for ages >45, is the one that shows the best model
fit.
Next I want to compare the coefficients and pseudo R^2s of the current
model when applied to the full data set vs when applied to a subset
which excludes the rows with missing values in variable Age. It should
be a good sign if they are identical, as it should indicate that
missingness in variable Age isn’t sufficiently correlated with the
remaining predictors to the point of extensively altering the proportion
of explained deviance.
# Current Model applied to (Age != NA) subset.
# NOTE(review): `Sex_F * Pclass` already expands to both main effects
# plus the interaction, so the explicit `Sex_F + Pclass +` terms are
# redundant; R de-duplicates terms, so the fit is unchanged, but the
# formula could be written as
#   Sex_F * Pclass + Sex_F * Embark_Q + Embark_C
# (left as-is because summary() prints the Call verbatim and the output
# below was generated from this exact formula).
no.na.model <- glm(formula = Survived ~
Sex_F + Pclass +
Sex_F * Pclass +
Sex_F * Embark_Q +
Embark_C,
family = binomial(), data = titanic.na.rm)
cat('\nPREDICTOR COEFFICIENTS ON PARTIAL DATASET (Age NAs excluded)\n')
summary(no.na.model)
# Compare explained deviance: model fitted on all 891 rows vs the same
# specification fitted on the 714 rows with observed Age.
cat('\n\nPSEUDO R^2s\n')
cat('\nPseudo R^2s for FULL DATASET\n')
log_pseudoR2s(gen.cl.emb.model)
cat('\nPseudo R^2s for PARTIAL DATASET\n')
log_pseudoR2s(no.na.model)
##
## PREDICTOR COEFFICIENTS ON PARTIAL DATASET (Age NAs excluded)
##
## Call:
## glm(formula = Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F *
## Embark_Q + Embark_C, family = binomial(), data = titanic.na.rm)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2573 0.3376 -0.762 0.445981
## Sex_F 6.2875 0.9367 6.712 1.91e-11 ***
## Pclass -0.5412 0.1425 -3.798 0.000146 ***
## Embark_Q -0.9710 1.0482 -0.926 0.354284
## Embark_C 0.6607 0.2577 2.564 0.010339 *
## Sex_F:Pclass -1.5415 0.3524 -4.374 1.22e-05 ***
## Sex_F:Embark_Q 1.2372 1.2329 1.003 0.315656
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 641.72 on 707 degrees of freedom
## AIC: 655.72
##
## Number of Fisher Scoring iterations: 6
##
##
##
## PSEUDO R^2s
##
## Pseudo R^2s for FULL DATASET
## Pseudo R^2 for logistic regression
## Hosmer and Lemeshow R^2 0.338
## Cox and Snell R^2 0.363
## Nagelkerke R^2 0.493
##
## Pseudo R^2s for PARTIAL DATASET
## Pseudo R^2 for logistic regression
## Hosmer and Lemeshow R^2 0.335
## Cox and Snell R^2 0.364
## Nagelkerke R^2 0.491
Summary Tables &
Density Plots
cat('\nCorrelation Matrix\n')
# Row of the correlation matrix for Age against the numeric predictors.
# NOTE(review): the hard-coded indices [4, c(1,3,4,6:12)] assume row 4
# of correlate()'s output is Age and that the column layout is stable --
# fragile if columns are added or reordered; confirm before reuse.
correlate(titanic,use = "pairwise.complete.obs")[4,c(1,3,4,6:12)] %>%
as.data.table()
cat('\nMean Age per Sex\n')
# Mean Age per Sex
round(
with(titanic,
tapply(Age,Sex,mean, na.rm = TRUE)), 1)
cat('\nMean Age per Pclass\n')
# Mean Age per Pclass
round(
with(titanic,
tapply(Age,Pclass,mean, na.rm = TRUE)), 1)
cat('\nMean Age per Embarked\n')
# Mean Age per Embarked (the original comment said "per Pclass" -- a
# copy-paste slip; the code groups by Embarked)
round(
with(titanic,
tapply(Age,Embarked,mean, na.rm = TRUE)), 1)
cat('\nMean and Std Deviation of Age per Subgroup\n')
# Per-subgroup Age summaries for all 18 Embarked x Sex x Pclass cells;
# unite() collapses the three grouping columns into one label.
titanic %>%
group_by(Sex, Pclass,Embarked) %>% # Group by Sex, Pclass and Embarked
summarise(Mean_Age = round(mean(Age, na.rm = TRUE), 2),
Std_Dev = round(sd(Age, na.rm = TRUE), 2),
Count = n()) %>%
unite("Emb_Sex_Cl", Embarked, Sex, Pclass, sep = ".") %>%
as.data.table()
##
## Correlation Matrix
## term Survived Pclass SibSp Parch Fare Sex_F
## 1: Age -0.07679579 -0.3696827 -0.3079811 -0.189098 0.09637049 -0.09287257
## Embark_Q Embark_C Embark_S
## 1: -0.02294232 0.03589134 -0.02263798
##
## Mean Age per Sex
## female male
## 27.9 30.7
##
## Mean Age per Pclass
## 1 2 3
## 38.2 29.9 25.1
##
## Mean Age per Embarked
## C Q S
## 30.8 28.0 29.5
##
## Mean and Std Deviation of Age per Subgroup
## Emb_Sex_Cl Mean_Age Std_Dev Count
## 1: C.female.1 36.05 13.06 43
## 2: Q.female.1 33.00 NA 1
## 3: S.female.1 33.46 14.23 50
## 4: C.female.2 19.14 8.71 7
## 5: Q.female.2 30.00 NA 2
## 6: S.female.2 29.71 12.97 67
## 7: C.female.3 14.00 11.70 23
## 8: Q.female.3 22.80 8.12 33
## 9: S.female.3 23.22 12.96 88
## 10: C.male.1 40.11 15.30 42
## 11: Q.male.1 44.00 NA 1
## 12: S.male.1 41.88 15.26 79
## 13: C.male.2 25.88 10.83 10
## 14: Q.male.2 57.00 NA 1
## 15: S.male.2 30.86 14.91 97
## 16: C.male.3 24.94 9.66 43
## 17: Q.male.3 28.07 20.90 39
## 18: S.male.3 26.56 11.69 265
### Boxplot distribution of Age per Subgroup
# One horizontal boxplot per Embarked x Sex x Pclass cell, coloured by
# class. Stored in age.groups so it can be reused later alongside the
# imputed-set boxplots in a grid.
titanic %>%
ggplot(aes(x = interaction(Embarked, Sex, Pclass), y = Age, fill = factor(Pclass))) +
geom_boxplot() +
labs(x = "Subgroups", y = "Age",
title = "Age Distribution per Subgroup") +
scale_fill_manual(values = c("firebrick2",
"cadetblue3",
"darkolivegreen3"),
name = "Pclass") +
theme_bw() +
coord_flip() -> age.groups
age.groups
### Age Distribution per Sex
# Overlapping density curves of Age, one per sex.
ggplot(titanic, aes(x = Age, fill = Sex)) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(values = c("firebrick2", "cadetblue3")) +
  labs(x = "Age", y = "Density", title = "Age Distribution per Sex") +
  theme_bw()
### Age Distribution per Pclass
# Overlapping density curves of Age, one per passenger class.
ggplot(titanic, aes(x = Age, fill = factor(Pclass))) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(
    values = c("firebrick2", "cadetblue3", "darkolivegreen3"),
    name = "Pclass"
  ) +
  labs(x = "Age", y = "Density", title = "Age Distribution per Pclass") +
  theme_bw()
### Age Distribution per Embarked
# Overlapping density curves of Age, one per port of embarkation.
ggplot(titanic, aes(x = Age, fill = Embarked)) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(
    values = c("firebrick2", "cadetblue3", "darkolivegreen3")
  ) +
  labs(x = "Age", y = "Density", title = "Age Distribution per Embarked") +
  theme_bw()
The correlation matrix shows that Age is strongly correlated with Pclass
(-0.37). The mean age for Pclass 1 is 38.2, against 25.1 for Pclass
3.
There is one particular oddity revealed by the boxplots of Age per each
of the 18 subgroups of Sex,Pclass and Embarked: C.female.3 has a much
lower median and IQR than any other category, and C.female.2 also has a
much lower median and IQR than other 2nd Class subgroups. With 30
observations, in a sample that covers 68% of the total population, we
can plausibly infer that 3rd and 2nd class women who embarked in
Cherbourg were younger than average, though it isn’t worth testing
formally.
As for the density plots, they show that:
- As expected from the correlation value, Pclass is by far the predictor
most associated with Age. The peak of the density curve is around 20 for
3rd Class, 30 for 2nd class and almost 40 for 1st class, which is
roughly in line with the calculated mean ages. The curves for 2nd and
3rd class are much narrower, with values mostly concentrated between
ages 20 and 40, whereas the curve for 1st class shows the lowest density
for very young ages and has a platykurtic shape with cases distributed
pretty evenly around 40 +/- 20.
- Women are somewhat more represented in ages 0 to 20, and somewhat less
so between ages 20 to 40 and 60 to 80;
- Queenstown passengers have the highest representation among ages 0 to
20, and the lowest in ages 40 to 60; Cherbourg passengers have the
highest representation in ages 40 to 60 (likely a consequence of having
a higher than average proportion of 1st Class passengers).
Survivability &
Age: Subgroup Analysis
The curve of the relation between P(Survived) and Age, plotted at the
beginning of the chapter, showed an unexpected valley around age 20.
From the information gathered from the density plots, if we repeat the
process using the same level of smoothness but this time drawing
different lines for different subgroups, the valley should disappear for
subgroups which are neither female nor 3rd class:
# Sample Survivability by Age, for each Sex.
# Loess smooths with 90% confidence bands, one curve per sex.
# NOTE(review): `se = T` -- prefer spelling out TRUE (T is reassignable).
titanic %>%
ggplot(aes(Age, Survived)) +
labs(x = "Age", y = "P(Survived)",
title = 'Sample Survivability by Age, per Sex') +
geom_smooth(method = 'loess',
span = 0.8,se = T, level = 0.90,
alpha = 0.05,
aes(color = Sex, fill = Sex)) +
scale_x_continuous(expand = c(0, 0)) + # removes padding
coord_cartesian(ylim = c(0,1)) +
scale_color_manual(values = c('male' = 'royalblue3',
'female' = 'firebrick2')) +
scale_fill_manual(values = c('male' = 'royalblue3',
'female' = 'firebrick2')) +
theme_bw() -> plot.age.sex
# Sample Survivability by Age, for each Class.
# Same construction as above, split by Pclass instead of Sex.
titanic %>%
ggplot(aes(Age, Survived)) +
labs(x = "Age", y = "P(Survived)",
title = 'Sample Survivability by Age, per PClass') +
geom_smooth(method = 'loess',
span = 0.8,se = T, level = 0.90,
alpha = 0.05,
aes(color = factor(Pclass),
fill = factor(Pclass))) +
scale_x_continuous(expand = c(0, 0)) + # removes padding
coord_cartesian(ylim = c(0,1)) +
scale_color_manual(values = c('1' = 'firebrick2',
'2' = 'royalblue3',
'3' = 'darkolivegreen3'),
name = 'Pclass') +
scale_fill_manual(values = c('1' = 'firebrick2',
'2' = 'royalblue3',
'3' = 'darkolivegreen3'),
name = 'Pclass') +
theme_bw() -> plot.age.class
# Arrange the two plots side by side (patchwork's `+` operator)
plot.age.sex + plot.age.class
Note: this sort of plot is just a rough guide to the general trend
in the data. They can be easily manipulated by altering the level of
smoothness being applied.
Nevertheless, keeping the same level of smoothness as the original plot
(0.8), the valley at around age 20 disappears for first class and female
passengers (though it’s still present for 2nd class passengers).
In addition, it seems my initial hypothesized relationship between
Survivability and Age - very high survivability for younger ages,
exponentially decaying until stabilizing around some value for adults -,
is a fairly appropriate model of the data when only males are
considered; but it is awfully inadequate to describe the relationship
for the subset of female passengers. Survivability is so
disproportionately higher for women vs men that age doesn’t even affect
it when considered as a group. Not only does survivability not drop with
age, it increases with age, consequence of the higher survivability of
1st class passengers, who as we saw are quite older on average.
I’d also like to verify the following:
a) is age a factor in survivability for women when we remove the class
factor, in other words, is the drop in survivability also present for
3rd class women?
b) What does the curve in survivability look like for 1st and 2nd class
men, when women are factored out?
# Sample Survivability by Age among Women, per Pclass.
# NOTE(review): PassengerIds 298, 773 and 484 are dropped by hand --
# presumably isolated cases that destabilise the loess fit at the age
# extremes; confirm the rationale, it is not documented here.
titanic %>%
filter(Sex == 'female') %>%
filter(!PassengerId %in% c(298, 773, 484)) %>%
ggplot(aes(Age, Survived)) +
labs(x = "Age", y = "P(Survived)",
title = 'Sample Survivability by Age (Women)') +
geom_smooth(method = 'loess',
span = 0.8,se = T, level = 0.90,
alpha = 0.05,
aes(color = factor(Pclass),
fill = factor(Pclass))) +
scale_x_continuous(expand = c(0, 0)) + # removes padding
coord_cartesian(ylim = c(0,1)) +
scale_color_manual(values = c('1' = 'firebrick2',
'2' = 'royalblue3',
'3' = 'darkolivegreen3'),
name = 'Pclass') +
scale_fill_manual(values = c('1' = 'firebrick2',
'2' = 'royalblue3',
'3' = 'darkolivegreen3'),
name = 'Pclass') +
theme_bw() -> plot.w.age.class
# Sample Survivability by Age among Men, per PClass.
# NOTE(review): PassengerIds 571 and 631 excluded by hand -- same
# undocumented rationale as above; confirm.
titanic %>%
filter(Sex == 'male') %>%
filter(!PassengerId %in% c(571,631)) %>%
ggplot(aes(Age, Survived)) +
labs(x = "Age", y = "P(Survived)",
title = 'Sample Survivability by Age (Men)') +
geom_smooth(method = 'loess',
span = 0.8,se = T, level = 0.90,
alpha = 0.05,
aes(color = factor(Pclass),
fill = factor(Pclass))) +
scale_x_continuous(expand = c(0, 0)) + # removes padding
coord_cartesian(ylim = c(0,1)) +
scale_color_manual(values = c('1' = 'firebrick2',
'2' = 'royalblue3',
'3' = 'darkolivegreen3'),
name = 'Pclass') +
scale_fill_manual(values = c('1' = 'firebrick2',
'2' = 'royalblue3',
'3' = 'darkolivegreen3'),
name = 'Pclass') +
theme_bw() -> plot.m.age.class
# Side-by-side arrangement via patchwork
plot.w.age.class + plot.m.age.class
Variable Name contains the titles Master and Miss (Master is the title
granted to young men back then, which I was not aware of until I saw
this dataset), as well as Mr and Mrs. It also contains a few other
cases, such as Dr. and Rev, which are always for 1st and 2nd class
passengers.
I want to create this variable prior to imputation to reduce the chance
of older ages being attributed to passengers with title ‘Master’, and
younger ages to ‘Mr’ and ‘Mrs’.
### NEW VARIABLE 'TITLE', extracted from variable Name
# Pull the four common titles out of Name; anything without a match
# (Dr, Rev, Mme, etc. -- all 1st/2nd class) is collapsed into
# 'High_Status', and the factor levels are fixed in a set order.
titanic <- titanic %>%
  mutate(
    Title = str_extract(Name, "Master|Miss|Mrs|Mr"),
    # coalesce() fills the non-matches (NA) with 'High_Status'
    Title = coalesce(Title, "High_Status"),
    # factor(levels = ...) fixes the same order fct_relevel() produced
    Title = factor(Title,
                   levels = c("Master", "Miss", "Mr", "Mrs", "High_Status"))
  )
# Plot: density of Age per Title (High_Status excluded -- too few cases
# for a meaningful curve). Stored in title.dist for later grids.
titanic %>%
filter(!Title %in% 'High_Status') %>% # exclude High Status cases
ggplot(aes(Age, fill = Title)) +
geom_density(alpha = 0.4) +
labs(x = "Age", y = "Density",
title = "Age Distribution per Title") +
scale_fill_manual(values =
c("firebrick2",
"cadetblue3",
"darkolivegreen3",
"orange")) +
theme_bw() -> title.dist
title.dist
Descriptive
Statistics
# Row of the correlation matrix for Age.NA against the other numeric
# variables.
# NOTE(review): the hard-coded indices [12, c(1,3,4,6:12)] assume row 12
# of correlate()'s output is Age.NA -- fragile if columns change.
correlate(titanic,use = "pairwise.complete.obs")[12,c(1,3,4,6:12)] %>%
  as.data.table()
# Cross-tabulations of Age missingness against outcome and predictors.
# FIX: the first cat() was missing its trailing '\n', which made the
# table header run into the cat() text in the rendered output
# ("Table for Age.NA x Survived Survived").
cat('\nTable for Age.NA x Survived\n')
with(titanic,
     table(Age.NA, Survived))
cat('\nTable for Age.NA x Pclass\n')
with(titanic,
     table(Age.NA, Pclass))
cat('\nProportions of NAs per Pclass\n')
# margin = 2 gives column-wise proportions (share of NAs within class)
round(
  with(titanic,
       prop.table(table(Age.NA, Pclass), margin = 2)), 2)
cat('\n')
cat('\nTable for Age.NA x Embarked\n')
with(titanic,
     table(Age.NA, Embarked))
cat('\nProportions of NAs per Embarked\n')
round(
  with(titanic,
       prop.table(table(Age.NA, Embarked), margin = 2)), 2)
cat('\nTable of Counts, Total NAs and Proportion of NAs, per Subgroup\n')
# Counts, NA totals and NA proportion for each of the 18 subgroups,
# sorted by NA count; kept in na.counts for the barplot below.
titanic %>%
  group_by(Embarked, Sex, Pclass) %>%
  summarise(Counts = n(),
            NAs = sum(Age.NA),
            NA.Prop = round(NAs/Counts,3)) %>%
  unite("Emb_Sex_Cl", Embarked, Sex, Pclass, sep = ".") %>%
  arrange(desc(NAs)) %>%
  as.data.table() -> na.counts
na.counts
# Barplot of Total Counts, with per-subgroup NA counts overlaid as a
# coloured inner bar and annotated with the NA count and proportion.
na.barplot <- na.counts %>%
# Midstep to color by Pclass (extracts the class digit from the
# "Port.sex.class" label; first match wins, and the only digits in the
# label are the trailing class)
mutate(Pclass = str_extract(Emb_Sex_Cl, "3|2|1")) %>%
# Recode Emb_Sex_Cl as factor
mutate(Emb_Sex_Cl = as.factor(Emb_Sex_Cl)) %>%
# Reorder in descending order of NAs
mutate(Emb_Sex_Cl = fct_reorder(Emb_Sex_Cl, desc(NAs))) %>%
# Plot Total Counts (grey outer bars)
ggplot(aes(x = Emb_Sex_Cl, y = Counts)) +
geom_bar(stat = "identity") +
# Plot NA Counts (coloured bars drawn over the totals)
geom_bar(aes(y = NAs, fill = as.factor(Pclass)),
stat = "identity") +
scale_fill_manual(values = c("firebrick2",
"cadetblue3",
"darkolivegreen3")) +
labs(fill = "Pclass", x = "Subgroups", y = "Counts",
title = "Age.NAs & Total Counts, per Subgroup") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3)) +
# Show NA Count as Text (fixed y positions near the axis)
geom_text(aes(y = 20, label = NAs),
color = "black", size = 4.5) +
# Show Proportion of NAs as Text
geom_text(aes(y = 6, label = paste0("(",round(NA.Prop, 2),")")),
color = "black", size = 3)
## term Survived Pclass SibSp Parch Fare Sex_F
## 1: Age.NA -0.09219652 0.1729329 0.01895757 -0.1241038 -0.1007071 -0.05521512
## Embark_Q Embark_C Embark_S
## 1: 0.3374132 0.03326967 -0.24148
##
## Table for Age.NA x Survived Survived
## Age.NA 0 1
## 0 424 290
## 1 125 52
##
## Table for Age.NA x Pclass
## Pclass
## Age.NA 1 2 3
## 0 186 173 355
## 1 30 11 136
##
## Proportions of NAs per Pclass
## Pclass
## Age.NA 1 2 3
## 0 0.86 0.94 0.72
## 1 0.14 0.06 0.28
##
##
## Table for Age.NA x Embarked
## Embarked
## Age.NA C Q S
## 0 130 28 556
## 1 38 49 90
##
## Proportions of NAs per Embarked
## Embarked
## Age.NA C Q S
## 0 0.77 0.36 0.86
## 1 0.23 0.64 0.14
##
## Table of Counts, Total NAs and Proportion of NAs, per Subgroup
## Emb_Sex_Cl Counts NAs NA.Prop
## 1: S.male.3 265 51 0.192
## 2: Q.male.3 39 25 0.641
## 3: Q.female.3 33 23 0.697
## 4: C.male.3 43 18 0.419
## 5: S.male.1 79 15 0.190
## 6: S.female.3 88 12 0.136
## 7: C.female.3 23 7 0.304
## 8: S.male.2 97 7 0.072
## 9: C.male.1 42 6 0.143
## 10: C.female.1 43 5 0.116
## 11: S.female.1 50 4 0.080
## 12: C.male.2 10 2 0.200
## 13: Q.female.2 2 1 0.500
## 14: S.female.2 67 1 0.015
## 15: C.female.2 7 0 0.000
## 16: Q.female.1 1 0 0.000
## 17: Q.male.1 1 0 0.000
## 18: Q.male.2 1 0 0.000
I’ll use multiple imputation to generate 5 different sets of imputed
values. I’ll first compare the overall results of the default predictive
mean matching (pmm) method vs a random forest method, and proceed with
the method that returns the most plausible results.
I’ll then analyse the imputed sets against the existing sample data. The
main difference between the Age NAs and the remaining sample is that
they are more likely to be 3rd class than the sample (and also more
likely to be from Queenstown than the sample, but the main
characteristic of Queenstown passengers is that they are almost
exclusively 3rd Class).
The values we want to replace with the NAs should reflect these
biases.
### USING RANDOM FOREST METHOD
# The mice() runs are expensive and non-deterministic, so their results
# were cached to disk once and are reloaded here.
# imputed_dat.rf <- mice(titanic, m=5, method='rf') [DO NOT RERUN]
# saveRDS(imputed_dat.rf, "Kaggle/imputed_dat.rf.rds")
# Load Data
imputed_dat.rf <- readRDS("Kaggle/imputed_dat.rf.rds")
# Plot Data
# imp$Age holds one column per imputation set (names "1".."5"), one row
# per originally-missing Age.
imputed.age.rf <- as_tibble(imputed_dat.rf$imp$Age)
ggplot() +
# Imputed Distributions (one density per imputation set)
geom_density(aes(x = imputed.age.rf$'1', color = 'Set 1')) +
geom_density(aes(x = imputed.age.rf$'2', color = 'Set 2')) +
geom_density(aes(x = imputed.age.rf$'3', color = 'Set 3')) +
geom_density(aes(x = imputed.age.rf$'4', color = 'Set 4')) +
geom_density(aes(x = imputed.age.rf$'5', color = 'Set 5')) +
# Overlay with Sample Age Distribution (dashed, thicker)
geom_density(aes(x = titanic$Age, color='Sample'),
size = 1.2,
linetype = 'dashed') +
labs(title = "Imputed Sets (RF Method)",
x = "Age",
y = "Density",
color = "Legend") +
scale_color_manual(values = c("Set 1" = "blue",
"Set 2" = "deeppink2",
"Set 3" = "green",
"Set 4" = "purple",
"Set 5" = "orange",
"Sample" = "firebrick2")) +
theme_bw() -> plots.rf
### USING PREDICTIVE MEAN MATCHING
# Same caching approach as above; note m=20 sets were generated here.
#imputed_dat.pmm <- mice(titanic, m=20, method='pmm') [DO NOT RERUN]
#saveRDS(imputed_dat.pmm, "Kaggle/imputed_dat.pmm.rds")
# Load Data
imputed_dat.pmm <- readRDS("Kaggle/imputed_dat.pmm.rds")
# Plot Data
imputed.age.pmm <- as_tibble(imputed_dat.pmm$imp$Age)
ggplot() +
# Imputed Distributions (colours set outside aes() -> no legend here)
geom_density(aes(x = imputed.age.pmm$'1'), color = "blue") +
geom_density(aes(x = imputed.age.pmm$'2'), color = "deeppink2") +
geom_density(aes(x = imputed.age.pmm$'3'), color = "green") +
geom_density(aes(x = imputed.age.pmm$'4'), color = "purple") +
geom_density(aes(x = imputed.age.pmm$'5'), color = "orange") +
# Overlay with Sample Age Distribution
geom_density(aes(x = titanic$Age), color="firebrick2",
size = 1.2,
linetype = 'dashed') +
labs(title = "Imputed Sets (PMM Method)",
x = "Age",
y = "Density") +
theme_bw() -> plots.pmm
# Side-by-side comparison (patchwork)
plots.pmm+plots.rf
The imputed results are wildly off the mark when using pmm (I actually
ran 20 of these, they were all bad), so I’ll proceed with the rf
alternative.
For the imputed results using the random forest method, most of the
imputed data have greater density around ages 20 to 30 than the sample
distribution, and slightly less between 40 to 60, which is what we want
to see, since we know 3rd Class passengers show greater density in the
20 to 30 age bracket.
However, Set 4 (purple) is an exception: it fits the sample curve too
neatly, which is a bad sign.
Next I will:
- Check the boxplots for each imputed data set for the 18 subgroups.
We’re mainly interested in the subgroups with the largest counts (for
low counts the boxplots do not return reliable ranges). These should
approximate the distribution in the remaining sample.
- Check for abnormal imputed values for the subgroups with low total
counts and high proportion of NAs.
- Check the distribution of imputed values per Title. There should be no
‘Masters’ with age higher than around 15, a very low count of ‘Misses’
above 40, and no Mr and Mrs under the age of about 15.
# BOXPLOT DISTRIBUTIONS for the 5 different imputed data sets
# Subset of 177 rows with Age NAs
titanic.is.na <- filter(titanic, Age.NA == 1)
### GENERATE 5 DATASETS
# Build titanic.na.set1 ... titanic.na.set5: the NA rows with Age
# filled in from each of the five imputed columns of imputed.age.rf.
for (set_id in 1:5) {
  imputed_ages <- imputed.age.rf[[as.character(set_id)]]
  assign(
    paste0("titanic.na.set", set_id),
    mutate(titanic.is.na, Age = imputed_ages)
  )
}
### GENERATE 5 BOXPLOTS
# One subgroup boxplot per imputed data set, each assigned to NA.Set_i
# (grid.arrange below uses those names; NA.Set_list keeps the same
# plots in a list, though it is not used afterwards in this section).
NA.Set_list <- list() # Initialize List
for (i in 1:5) {
dataset <- paste0("titanic.na.set", i)
plot_name <- paste0("NA.Set_", i)
title_name <- paste0("Imputed Values, Set ", i)
# Plot Boxplot (same construction as the sample-wide age.groups plot,
# so the panels are directly comparable in the grid)
plot <- get(dataset) %>%
ggplot(aes(x = interaction(Embarked, Sex, Pclass),
y = Age, fill = factor(Pclass))) +
geom_boxplot() +
labs(x = "Subgroups",
y = "Age",
title = title_name) +
scale_fill_manual(values = c("firebrick2",
"cadetblue3",
"darkolivegreen3"),
name = "Pclass") +
theme_bw() +
coord_flip()
# Create Object
assign(plot_name, plot)
# Add to List
NA.Set_list[[i]] <- plot
}
# Create Grid: sample distribution (age.groups) + the 5 imputed sets
grid.arrange(age.groups, NA.Set_1, NA.Set_2,
NA.Set_3, NA.Set_4, NA.Set_5,
ncol = 2)
Identifying Oddities:
a) S.female.3: All imputed sets have consistently low IQR and
medians for this subgroup, compared to the sample. This category only
has 12 cases, so I would actually expect higher variability than that.
So there’s likely a reason for this pattern;
b) C.female.3 With the exception of Set 4, the 7 imputed values
for this category have higher IQRs than the sample.
Analysing Oddities:
### INSPECTING ODDITIES
# Open the NA rows for the two odd subgroups in the data viewer.
# Inspecting S.female.3
view(filter(titanic.is.na, Sex == 'female', Pclass == 3, Embarked == 'S'))
# Inspecting C.female.3
view(filter(titanic.is.na, Sex == 'female', Pclass == 3, Embarked == 'C'))
a) S.female.3 has 9 cases of ‘Miss’, out of 12. It also has fairly high
counts on average for variable SibSp. I haven’t addressed this variable
yet, but it is the count for Siblings or Spouses. So one can see how
cases with high values (higher than one, at least), will correlate with
being younger. The random forest process caught that pattern, and this
explains the lower than expected IQR and median.
b) The sample age distribution for C.female.3 has a lower than average
age, but among the NAs there’s only 2 cases of Miss, and it includes 2
mothers. So this adequately explains the higher IQR.
Next I take the same approach to verify there are no odd imputed values
in the distributions of Age per Title. If there are, it needs to be
addressed.
### GENERATE 5 DENSITY PLOTS
# Age-per-Title density curves for each imputed set, assigned to
# Title.Set_i for the grid below (Title.Set_list mirrors them in a
# list but is not used afterwards).
Title.Set_list <- list() # Initialize List
for (i in 1:5) {
dataset <- paste0("titanic.na.set", i)
plot_name <- paste0("Title.Set_", i)
title_name <- paste0("Imputed Values per Title, Set ", i)
# Plot density curves (same construction as title.dist, so the panels
# are comparable; High_Status excluded as before)
plot <- get(dataset) %>%
filter(!Title %in% 'High_Status') %>%
ggplot(aes(Age, color = Title)) +
geom_density(alpha = 0.6) +
labs(x = "Age", y = "Density",
title = title_name) +
scale_color_manual(values =
c("firebrick2",
"cadetblue3",
"darkolivegreen3",
"orange")) +
theme_bw()
# Create Object
assign(plot_name, plot)
# Add to List
Title.Set_list[[i]] <- plot
}
# Create Grid: sample distribution (title.dist) + the 5 imputed sets
grid.arrange(title.dist, Title.Set_1, Title.Set_2,
Title.Set_3, Title.Set_4, Title.Set_5,
ncol = 2)
Only Set 2 shows an adequate curve for the age distribution of ‘Master’,
all others have varying degrees of inadequacy. Given there aren’t too
many occurrences in the first place, the simpler approach will be to
replace these cases with the mean age of Master, which I’ll apply to
cases older than 13, since the oldest recorded age for Master is 12 and
there are no male records at 13 (but at 14, they are all ‘Mr’).
The distributions of age for ‘Miss’ are acceptable. Some sets have some
occurrences at ages between 50 and 60, but not sufficient to be an
issue.
The curves for ‘Mr’ appropriately lack cases at ages under 15, and peak
at around 25 to 30.
Finally, the imputed values for ‘Mrs’ aren’t great, with the exception
of Set 2, because they occur in ages under 15. Since the youngest
married woman in the data is 14 years old, I’ll change the imputation
method for ages under 14, by replacing them with the mean.
### MASTER: Replace Ages > 13 with the mean age of 'Master'
# The oldest observed Master is 12 and there are no male records at 13,
# so any imputed Master age above 13 is implausible and is replaced by
# the sample mean for the title.
# The replacement value does not vary across sets, so compute it once
# instead of once per loop iteration.
master_mean_age <- round(
  mean(titanic$Age[titanic$Title == 'Master'], na.rm = TRUE), 1)
for (i in 1:5) {
  # Resolve the set by its constructed name
  set_name <- paste0("titanic.na.set", i)
  set_data <- get(set_name)
  # Replace implausible Master ages with the sample mean
  set_data <- set_data %>%
    mutate(Age = ifelse(Title == 'Master' & Age > 13,
                        master_mean_age,
                        Age))
  # Assign back to original name
  assign(set_name, set_data)
}
### MRS: Replace Ages < 14 with the mean age of 'Mrs'
# The youngest married woman in the data is 14, so imputed Mrs ages
# under 14 are implausible and are replaced by the sample mean for the
# title. (The inner comment in the original was a copy-paste slip from
# the Master loop; this block handles Mrs, ages UNDER 14.)
# Loop-invariant replacement value, computed once.
mrs_mean_age <- round(
  mean(titanic$Age[titanic$Title == 'Mrs'], na.rm = TRUE), 1)
for (i in 1:5) {
  # Resolve the set by its constructed name
  set_name <- paste0("titanic.na.set", i)
  set_data <- get(set_name)
  # Replace implausible Mrs ages with the sample mean
  set_data <- set_data %>%
    mutate(Age = ifelse(Title == 'Mrs' & Age < 14,
                        mrs_mean_age,
                        Age))
  # Assign back to original name
  assign(set_name, set_data)
}
### Plot Density Curves with the updated data
# Re-renders the per-Title density grid after the Master/Mrs cleaning
# above; deliberately the same construction as the earlier loop so the
# before/after panels are directly comparable.
Title.Set_list <- list() # Initialize List
for (i in 1:5) {
dataset <- paste0("titanic.na.set", i)
plot_name <- paste0("Title.Set_", i)
title_name <- paste0("Imputed Values per Title, Set ", i)
# Plot density curves per Title (High_Status excluded, as before)
plot <- get(dataset) %>%
filter(!Title %in% 'High_Status') %>%
ggplot(aes(Age, color = Title)) +
geom_density(alpha = 0.6) +
labs(x = "Age", y = "Density",
title = title_name) +
scale_color_manual(values =
c("firebrick2",
"cadetblue3",
"darkolivegreen3",
"orange")) +
theme_bw()
# Create Object
assign(plot_name, plot)
# Add to List
Title.Set_list[[i]] <- plot
}
# Create Grid: sample distribution (title.dist) + the 5 cleaned sets
grid.arrange(title.dist, Title.Set_1, Title.Set_2,
Title.Set_3, Title.Set_4, Title.Set_5,
ncol = 2)
The new density curves with the reworked imputation now show no
occurrences of Master above age 13 and no occurrences of Mrs under age
14. The results are much better.
Of all of the sets, only Set 4 looked like it could be inadequate
because its density curve tracked the sample curve too closely. I was
afraid the biases weren’t being reflected, but further inspection didn’t
reveal anything out of the ordinary. Therefore I’ll be pooling from all
5 sets when adding Age to the model in the next part.
Before proceeding, I’ll create 5 titanic datasets (titanic.1, titanic.2,
etc) with the new imputed values.
### Creating 5 titanic datasets with imputed data
# For each imputed set i, build 'titanic.i': the full titanic data with
# its Age NAs filled from titanic.na.set_i.
for (i in 1:5) {
  # Names of the imputed source set and of the dataset to create
  source_name <- paste0("titanic.na.set", i)
  target_name <- paste0("titanic.", i)
  # Keep only the join key and the imputed Age values
  age_lookup <- get(source_name) %>%
    select(PassengerId, Age)
  # Join the imputed ages onto the main data by PassengerId; the
  # incoming Age column is suffixed '.imp' to avoid a name clash
  merged <- titanic %>%
    left_join(age_lookup, by = "PassengerId", suffix = c("", ".imp")) %>%
    # Use the imputed value where one exists, otherwise keep Age
    mutate(Age = coalesce(Age.imp, Age)) %>%
    # Drop the helper column
    select(-Age.imp)
  # Store the result as 'titanic.i'
  assign(target_name, merged)
}
# Verify: for rows flagged as imputed (Age.NA == 1), the Age values in
# titanic.1 must match the imputed set. Checked manually on set 1 only;
# no point in a for loop over all 5 sets.
all.equal(titanic.1$Age[titanic.1$Age.NA==1],
titanic.na.set1$Age)
## [1] TRUE
I’ll omit the code in this section as it’s just a repeat of the previous
one and show only some useful summaries.
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.17 21.00 27.00 30.27 39.00 76.00 86
##
## N= 418
Like the main dataset, the test dataset is missing 20% of data in
variable Age.
I’ve tidied the variable like in the main set and also created a new
‘Title’ variable. I also looked through the overall age distribution and
subgroup distributions in the test sample, and there’s nothing terribly
misaligned with the main sample.
Next I use the same imputation approach to generate 5 sets of NAs for
the test data, and create 5 new test data sets, named test.1, test.2,
etc. I’ve omitted the process and show only the final density
curves.
##
## Set 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.50 21.00 24.50 27.29 31.50 64.00
##
## Set 2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 22.00 27.00 29.55 32.75 76.00
##
## Set 3
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.50 19.25 24.00 25.41 29.75 60.00
##
## Set 4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.50 21.00 25.00 27.51 30.00 64.00
##
## Set 5
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 22.00 28.50 30.51 37.50 76.00
In this section I’ll use a single dataset, titanic.1, to verify which
model is the best fit. First I’ll compare the linear, exponential and
piecewise approaches addressed in section ‘Baseline Predictive Models’,
choose the best out of the three, then convert age into a categorical
variable and see if it performs any better.
Previously, when modelling the relationship P(Survived) ~ Age, the
piecewise model proved to be significantly better than the exponential
model, and this in turn was significantly better than the linear model.
I’ll first check if that’s still the case when we add Age to the main
model.
# Load data for piecewise model
# titanic.piece is identical to titanic.1, but has a Piecewise
# variable. Process is shown in 'Baseline Predictive Models'
titanic.piece <- read.csv("titanic.piece.csv")
# Three nested candidates for the Age term, all sharing the same base
# predictors. NOTE(review): 'Sex_F + Pclass + Sex_F * Pclass' is
# redundant — 'Sex_F * Pclass' already expands to both main effects
# plus the interaction; R drops the duplicates, so this is harmless.
# Linear Model: Age enters untransformed
age.linear <- glm(formula = Survived ~
Sex_F + Pclass +
Sex_F * Pclass +
Sex_F * Embark_Q +
Embark_C +
Age * Sex_F,
family = binomial(), data = titanic.1)
# Exponential Model: log-transformed Age
age.exponential <- glm(formula = Survived ~
Sex_F + Pclass +
Sex_F * Pclass +
Sex_F * Embark_Q +
Embark_C +
log(Age) * Sex_F,
family = binomial(), data = titanic.1)
# Piecewise Model: Age split into segments (Piecewise variable)
age.piecewise <- glm(formula = Survived ~
Sex_F + Pclass +
Sex_F * Pclass +
Sex_F * Embark_Q +
Embark_C +
Piecewise * Sex_F,
family = binomial(), data = titanic.piece)
# Deviance comparison against the base model (gen.cl.emb.model).
# NOTE(review): without test = "Chisq", anova.glm prints deviances only,
# no p-values — intentional here, judging from the shown output.
anova(gen.cl.emb.model, age.linear, age.exponential, age.piecewise)
## Analysis of Deviance Table
##
## Model 1: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C
## Model 2: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C + Age * Sex_F
## Model 3: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C + log(Age) * Sex_F
## Model 4: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C + Piecewise * Sex_F
## Resid. Df Resid. Dev Df Deviance
## 1 884 785.10
## 2 882 754.25 2 30.8541
## 3 882 742.10 0 12.1487
## 4 882 746.47 0 -4.3701
Adding Age to the model significantly reduces the amount of unexplained
deviance from 785 to 754, and then again to 742 if we instead assume an
exponential relationship. But unlike the base model, splitting variable
Age into three sections prior to adding it to the model did not improve
it. It seems to be worse than just using log(Age).
Whatever extra variability the piecewise model was explaining when
predicting P(Survived) exclusively on Age, it seems to be captured by
the already existing predictors in the model.
So the ‘age.exponential’ model is the best alternative of the three when
treating Age as a continuous variable. But I still want to compare this
model with another which treats Age as a categorical variable.
# Create new 'AgeCategory' variable: bin Age into 5 ordered groups.
# case_when() evaluates conditions in order; unmatched rows (NA Age,
# already imputed away) fall through to NA_character_.
titanic.temp <- titanic.1 %>%
  mutate(
    AgeCategory = case_when(
      Age < 2 ~ 'Babies',
      Age >= 2 & Age < 16 ~ 'Children',
      Age >= 16 & Age < 22 ~ 'Young Adult',
      Age >= 22 & Age < 60 ~ 'Adult',
      Age >= 60 ~ 'Elderly',
      TRUE ~ NA_character_) %>%
      # 'Adult' is listed first, so it is already the reference (base)
      # level — the original trailing relevel(ref = 'Adult') was a no-op
      # and has been removed.
      factor(levels = c('Adult', 'Elderly', 'Young Adult',
                        'Children', 'Babies')))
cat('\nCounts and Survival Proportion per Age Category\n\n')
# Survival proportion per category, rounded to 2 decimals
round(tapply(titanic.temp$Survived, titanic.temp$AgeCategory, mean), 2)
cat('\n')
# Counts per category (summary of a factor)
summary(titanic.temp$AgeCategory)
# Age as Category Model: same base predictors as age.exponential, but
# with AgeCategory (and its Sex interaction) in place of log(Age)
age.category <- glm(formula = Survived ~
Sex_F + Pclass +
Sex_F * Pclass +
Sex_F * Embark_Q +
Embark_C +
AgeCategory*Sex_F,
family = binomial(), data = titanic.temp)
cat('\n\n\n')
# Deviance comparison: continuous log(Age) vs categorical AgeCategory
anova(age.exponential,age.category)
cat('\n')
# AICs for the two competing specifications (lower is better)
cat('\nage.exponential AIC:',age.exponential$aic)
cat('\nage.category AIC:',age.category$aic)
##
## Counts and Survival Proportion per Age Category
##
## Adult Elderly Young Adult Children Babies
## 0.38 0.27 0.32 0.50 0.76
##
## Adult Elderly Young Adult Children Babies
## 607 30 155 82 17
##
##
##
## Analysis of Deviance Table
##
## Model 1: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C + log(Age) * Sex_F
## Model 2: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C + AgeCategory * Sex_F
## Resid. Df Resid. Dev Df Deviance
## 1 882 742.10
## 2 876 740.36 6 1.7415
##
##
## age.exponential AIC: 760.0974
## age.category AIC: 770.3559
Coding variable Age as categorical does not offer any advantage, and it
increases the AIC from 760 to 770.
Using log(Age) and an interaction term “log(Age) x Sex” provides the
best results. This is likely due to the huge difference in survivability
at very young ages for men.
Adding a term “log(Age) x factor(Pclass)” did further reduce the amount
of unexplained deviance, but the model looks bloated and overall harder
to interpret.
The coefficients for the model ‘age.exponential’ can be seen below. I’ll
also look at the predicted probabilities for groups of interest, to make
sure I’m happy with the fit of the model, and analyse the diagnostic
data.
# Coefficient table of the chosen model (log(Age) with Sex interaction)
summary(age.exponential)
##
## Call:
## glm(formula = Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F *
## Embark_Q + Embark_C + log(Age) * Sex_F, family = binomial(),
## data = titanic.1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.1722 0.6770 4.686 2.79e-06 ***
## Sex_F 3.3865 1.3061 2.593 0.009520 **
## Pclass -0.8081 0.1436 -5.627 1.83e-08 ***
## Embark_Q -0.6224 0.6401 -0.972 0.330846
## Embark_C 0.6477 0.2341 2.766 0.005671 **
## log(Age) -0.9164 0.1550 -5.912 3.37e-09 ***
## Sex_F:Pclass -1.4341 0.3574 -4.013 6.01e-05 ***
## Sex_F:Embark_Q 1.9855 0.7742 2.565 0.010330 *
## Sex_F:log(Age) 0.8512 0.2306 3.690 0.000224 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 742.1 on 882 degrees of freedom
## AIC: 760.1
##
## Number of Fisher Scoring iterations: 6
# New dataset with fitted and diagnostic data (See Custom Functions).
# Creates 'age.exponential_tib' containing the listed columns plus
# predicted probabilities, residuals, leverage and Cook's distance.
model_tib(age.exponential, titanic.1, PassengerId, Survived, Sex, Pclass,Embarked, Age)
# Sample Proportion vs Model Predictions (Sex)
# Smoothed observed survival vs smoothed predicted probability by sex.
# Fixes: leading '<-' assignment replaces the original trailing '->'
# and 'se = FALSE' replaces the reassignable shorthand 'F'.
fitted.sex <- age.exponential_tib %>%
  ggplot(aes(Age)) +
  # Sample survivability (loess smooth of the 0/1 outcomes)
  geom_smooth(span = 0.8, se = FALSE,
              aes(y = Survived,
                  color = interaction(Sex, "Sample"),
                  linetype = "Sample")) +
  # Model prediction (loess smooth of fitted probabilities)
  geom_smooth(span = 0.8, se = FALSE,
              aes(y = predicted.prob,
                  color = interaction(Sex, "Pred"),
                  linetype = "Pred")) +
  # Labels & theme
  labs(x = "Age", y = "P(Survived)",
       title = 'Sample vs Model Predictions (Sex)') +
  scale_x_continuous(expand = c(0, 0)) + # removes padding
  coord_cartesian(ylim = c(0,1)) +
  # Dashed = sample, solid = prediction; legend keys drawn in grey
  scale_linetype_manual(values = c("Sample" = "dashed",
                                   "Pred" = "solid"),
                        name = 'Linetype',
                        guide = guide_legend(
                          override.aes = list(color = "grey30"))) +
  scale_color_manual(values = c('male.Pred' = 'royalblue3',
                                'female.Pred' = 'firebrick2',
                                'male.Sample' = 'lightblue',
                                'female.Sample' = 'lightcoral'),
                     name = "Groups") +
  theme_bw()
# Sample Proportion vs Model Predictions (Pclass)
# Same comparison as fitted.sex, grouped by passenger class.
# Fixes: leading '<-' assignment replaces the original trailing '->'
# and 'se = FALSE' replaces the reassignable shorthand 'F'.
fitted.pclass <- age.exponential_tib %>%
  ggplot(aes(Age)) +
  # Sample survivability (loess smooth of the 0/1 outcomes)
  geom_smooth(span = 0.8, se = FALSE,
              aes(y = Survived,
                  color = interaction(Pclass, "Sample"),
                  linetype = "Sample")) +
  # Model prediction (loess smooth of fitted probabilities)
  geom_smooth(span = 0.8, se = FALSE,
              aes(y = predicted.prob,
                  color = interaction(Pclass, "Pred"),
                  linetype = "Pred")) +
  # Labels & theme
  labs(x = "Age", y = "P(Survived)",
       title = 'Sample vs Model Predictions (Pclass)') +
  scale_x_continuous(expand = c(0, 0)) + # removes padding
  coord_cartesian(ylim = c(0,1)) +
  # Dashed = sample, solid = prediction; legend keys drawn in grey
  scale_linetype_manual(values = c("Sample" = "dashed",
                                   "Pred" = "solid"),
                        name = 'Linetype',
                        guide = guide_legend(
                          override.aes = list(color = "grey30"))) +
  scale_color_manual(values = c('1.Pred' = 'firebrick2',
                                '2.Pred' = 'royalblue3',
                                '3.Pred' = 'darkolivegreen3',
                                '1.Sample' = 'lightcoral',
                                '2.Sample' = 'lightblue',
                                '3.Sample' = 'lightgreen'),
                     name = 'Groups') +
  theme_bw()
# Render the two sample-vs-prediction comparison plots
fitted.sex
fitted.pclass
In this section I’ll fit the age.exponential model on each of the 5
titanic.x training sets, and then predict survival probabilities for
each of the 5 test sets using these 5 models.
This will return a dataset with 25 sets of predictions, from where I’ll
obtain the final prediction by averaging each row.
(Note: I’m still learning multiple imputation and I was getting
errors when attempting to apply the pooled model to the test sets. My
intuition tells me the approach above should produce reasonable final
estimates for each row).
# Fit the regression model on the 5 titanic sets.
# lapply returns the list of fitted models directly, replacing the
# explicit list-growing for loop.
age.models_list <- lapply(1:5, function(i) {
  glm(formula = Survived ~
        Sex_F + Pclass +
        Sex_F * Pclass +
        Sex_F * Embark_Q +
        Embark_C +
        log(Age) * Sex_F,
      family = binomial(),
      data = get(paste0("titanic.", i)))
})
# Deviance comparison across the five fits (same formula, different
# imputed data, so Df differences are zero by construction)
anova(age.models_list[[1]],
      age.models_list[[2]],
      age.models_list[[3]],
      age.models_list[[4]],
      age.models_list[[5]])
# Pool models. NOTE(review): mice::pool accepts a plain list of fitted
# models, not only a 'mids'/'mira' object — confirm against the
# installed mice version.
pooled_age.model <- pool(age.models_list)
pooled_age.model2 <- MIcombine(age.models_list) # same result via mitools
# Summary of the pooled model (Rubin's rules: pooled estimates, SEs, df)
cat('\n\nSummary of the pooled model\n')
summary(pooled_age.model)
### Use each model on each test set and save data in new dataset
# Initialize new dataset: key column, imputed-Age flag, and a
# placeholder for the final averaged probability
all_predictions <- test.1 %>%
select(PassengerId, Age.NA) %>%
mutate(Pred_Prob = NA_real_)
# 5 models x 5 test sets = 25 prediction columns
for (i in 1:5) {
for (j in 1:5) {
# Apply model i to test set j; type = "response" returns probabilities
pred <- predict(age.models_list[[i]],
newdata = get(paste0("test.", j)),
type = "response")
# Wrap the predictions in a one-column tibble named "m_i_t_j"
pred_tibble <- tibble(!!paste0("m_", i, "_t_", j) := pred)
# Append the column (grows by bind_cols; fine for 25 columns)
all_predictions <- bind_cols(all_predictions, pred_tibble)
}
}
# Obtain the average prediction from all 25
all_predictions <- all_predictions %>%
  # Row-wise mean over the 25 prediction columns, selected by their
  # "m_i_t_j" name prefix. This replaces the fragile positional index
  # .[-c(1, 2, 3)] (whose comment also omitted the Age.NA column) and
  # stays correct if columns are reordered.
  mutate(Pred_Prob = rowMeans(select(., starts_with("m_"))))
# View an excerpt: Age.NA flag, row average, and model 1's predictions
cat('\n\nExcerpt of predicted values and Row Average\n')
all_predictions %>%
  select(2:8) %>%
  filter(Age.NA == 1) %>%
  head(10)
## Analysis of Deviance Table
##
## Model 1: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C + log(Age) * Sex_F
## Model 2: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C + log(Age) * Sex_F
## Model 3: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C + log(Age) * Sex_F
## Model 4: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C + log(Age) * Sex_F
## Model 5: Survived ~ Sex_F + Pclass + Sex_F * Pclass + Sex_F * Embark_Q +
## Embark_C + log(Age) * Sex_F
## Resid. Df Resid. Dev Df Deviance
## 1 882 742.10
## 2 882 746.57 0 -4.4773
## 3 882 744.52 0 2.0597
## 4 882 749.57 0 -5.0510
## 5 882 747.28 0 2.2858
##
##
## Summary of the pooled model
## term estimate std.error statistic df p.value
## 1 (Intercept) 2.9513632 0.6821926 4.3262903 469.4213 1.853881e-05
## 2 Sex_F 3.8336793 1.3573687 2.8243464 434.7036 4.955647e-03
## 3 Pclass -0.7946385 0.1437348 -5.5285055 847.4303 4.301709e-08
## 4 Embark_Q -0.5753284 0.6399286 -0.8990509 860.2119 3.688771e-01
## 5 Embark_C 0.6332100 0.2346043 2.6990550 866.7509 7.089021e-03
## 6 log(Age) -0.8548462 0.1582588 -5.4015714 252.5451 1.525370e-07
## 7 Sex_F:Pclass -1.4703552 0.3594037 -4.0910965 858.6157 4.697605e-05
## 8 Sex_F:Embark_Q 1.9499496 0.7733366 2.5214759 872.8985 1.186338e-02
## 9 Sex_F:log(Age) 0.7335655 0.2491613 2.9441392 171.7648 3.687037e-03
##
##
## Excerpt of predicted values and Row Average
## Age.NA Pred_Prob m_1_t_1 m_1_t_2 m_1_t_3 m_1_t_4 m_1_t_5
## 1 1 0.09719959 0.09342913 0.08104988 0.07336536 0.13000841 0.09957673
## 2 1 0.98324591 0.98388575 0.98260134 0.98322545 0.98364417 0.98317645
## 3 1 0.16984356 0.15169732 0.17992522 0.19199265 0.22214448 0.10069847
## 4 1 0.39119710 0.40597388 0.40086314 0.40597388 0.40086314 0.39188027
## 5 1 0.41551464 0.40860545 0.40425658 0.44035607 0.40933922 0.40860545
## 6 1 0.11003602 0.09957673 0.11484766 0.10298163 0.12450611 0.10298163
## 7 1 0.36268642 0.25707512 0.35760214 0.39508104 0.44217169 0.37534048
## 8 1 0.05414810 0.03634358 0.04519192 0.06254922 0.05077608 0.05803343
## 9 1 0.29770708 0.35745972 0.34772841 0.29942275 0.21332484 0.27439826
## 10 1 0.08522934 0.07700698 0.08556564 0.11484766 0.06706562 0.07006728
Finally, I’ll add the column Pred_Prob to the ‘titanic.test’ dataset and
predict a binary survival outcome, before saving and submitting a new
file to Kaggle:
### Add final prediction to the original test.set, predict binary outcome
# Drop any previous Pred_Prob before the join. any_of() makes this safe
# on a first run, when the column does not yet exist — a bare
# select(-Pred_Prob) would error there.
titanic.test <- titanic.test %>%
  select(-any_of("Pred_Prob")) %>%
  left_join(all_predictions %>%
              select(PassengerId, Pred_Prob), by = "PassengerId")
# Classify with a 0.5 probability threshold
titanic.test$Survived <- ifelse(titanic.test$Pred_Prob > 0.5, 1, 0)
# Extract the two submission columns and (optionally) save to CSV
age.exp.model_sub <- titanic.test %>%
  select(PassengerId, Survived)
# write_csv(age.exp.model_sub, "Kaggle/age.exp.model_sub.csv")
The custom function ‘model_tib’ creates a dataset with the fitted values
along with diagnostic data for that model. I’ll address them in this
section.
# Outlier, influence and fit diagnostics for age.exponential.
# Counts and proportions are now computed from the data instead of
# hard-coding the values 41, 3, 0 and n = 891, so the printed
# proportions stay correct if the model or data change.
cat('\nChecking for Outliers\n')
n_total <- nrow(age.exponential_tib)
# Residuals beyond |1.96|: expect ~5% of cases by chance
n_out196 <- age.exponential_tib %>%
  filter(abs(standardized.res) > 1.96) %>%
  nrow()
cat('\nStandardized Residuals > |1.96| = ', n_out196)
cat('\nProportion of Std. Res.> |1.96| =', round(n_out196 / n_total * 100, 1))
cat('\n')
# Residuals beyond |2.58|: expect ~1% of cases by chance
n_out258 <- age.exponential_tib %>%
  filter(abs(standardized.res) > 2.58) %>%
  nrow()
cat('\nStandardized Residuals > |2.58| = ', n_out258)
cat('\nProportion of Std. Res.> |2.58| =', round(n_out258 / n_total * 100, 1))
cat('\n')
# Residuals beyond |3.29|: should be (close to) none
n_out329 <- age.exponential_tib %>%
  filter(abs(standardized.res) > 3.29) %>%
  nrow()
cat('\nStandardized Residuals > |3.29| = ', n_out329)
cat('\nProportion of Std. Res.> |3.29| =', round(n_out329 / n_total * 100, 1))
cat('\n\n')
cat('\nChecking for Influential Cases\n')
# Cook's distance > 1 flags cases with undue influence on the fit
cat('\nCases of Cook\'s Distance > 1 =',
    age.exponential_tib %>%
      filter(cookd > 1) %>%
      nrow())
cat('\nHighest Leverage=',
    age.exponential_tib$leverage %>%
      max() %>%
      round(3))
cat('\n\n')
cat('\nChecking Model Diagnostics\n')
cat('\nVIFs\n')
# Variance inflation factors (rms implementation)
rms::vif(age.exponential)
cat('\nModel Fit:\n')
cat('\nBefore adding Age\n')
log_pseudoR2s(gen.cl.emb.model)
cat('\nAfter adding Age\n')
log_pseudoR2s(age.exponential)
##
## Checking for Outliers
##
## Standardized Residuals > |1.96| = 41
## Proportion of Std. Res.> |1.96| = 4.6
##
## Standardized Residuals > |2.58| = 3
## Proportion of Std. Res.> |2.58| = 0.3
##
## Standardized Residuals > |3.29| = 0
## Proportion of Std. Res.> |3.29| = 0
##
##
## Checking for Influential Cases
##
## Cases of Cook's Distance > 1 = 0
## Highest Leverage= 0.113
##
##
## Checking Model Diagnostics
##
## VIFs
## Sex_F Pclass Embark_Q Embark_C log(Age)
## 46.605341 1.645388 3.475462 1.066858 2.106468
## Sex_F:Pclass Sex_F:Embark_Q Sex_F:log(Age)
## 27.733627 3.726957 14.114168
##
## Model Fit:
##
## Before adding Age
## Pseudo R^2 for logistic regression
## Hosmer and Lemeshow R^2 0.338
## Cox and Snell R^2 0.363
## Nagelkerke R^2 0.493
##
## After adding Age
## Pseudo R^2 for logistic regression
## Hosmer and Lemeshow R^2 0.375
## Cox and Snell R^2 0.393
## Nagelkerke R^2 0.534
I’ll update this section as I add predictors to the model.
lb.subset %>% # Density of Scores between 0.700 and 0.825
  # (original comment said "Histogram", but the geom is geom_density)
  ggplot(aes(Score)) +
  geom_density() +
  scale_x_continuous(breaks = seq(0.700, 0.825, by = 0.0125)) +
  # Gender Model score
  geom_vline(xintercept = 0.76555,
             linetype = "dashed", color = "firebrick2", size = 1) +
  # Age Model Score
  geom_vline(xintercept = 0.78468,
             linetype = "dashed", color = "royalblue3", size = 1) +
  annotate("text",
           x = 0.76555,
           y = 31.5, label = "Gender model\nbaseline score",
           hjust = 1.05, color = "firebrick2") +
  annotate("text",
           x = 0.78468,
           y = 31.5, label = "Current model \n score",
           hjust = -0.05, color = "royalblue3") +
  geom_point(aes(x = 0.7874629, y = 20),
             color = "darkolivegreen4", size = 2) +
  annotate("text",
           x = 0.7874629,
           y = 20, label = "Personal Goal",
           hjust = -0.05, color = "darkolivegreen4") +
  # Fix: y axis shows a density, not a count — relabel accordingly
  labs(x = "Score", y = "Density",
       title = 'Kaggle Leaderboard, Excluding Tutorial Submissions (Estimate)') +
  theme_bw()